#!perl -w
# vim: sw=4 ts=4 et :
# (C) 2003-2007 Willem Jan Hengeveld <itsme@xs4all.nl>
# Web: http://www.xs4all.nl/~itsme/
#      http://wiki.xda-developers.com/
#
# $Id: $
#
# http://en.wikipedia.org/wiki/Linear_regression
# http://mathworld.wolfram.com/LeastSquaresFitting.html
# http://mathworld.wolfram.com/LeastSquaresFittingPolynomial.html
# http://mathworld.wolfram.com/LeastSquaresFittingExponential.html
# http://mathworld.wolfram.com/LeastSquaresFittingPerpendicularOffsets.html
# http://mathworld.wolfram.com/NonlinearLeastSquaresFitting.html
#
#  tests:
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4*exp(0.2*$_)) for 1..100' | lsq | sort -n
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*log($_)) for 1..100' | lsq | sort -n
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4*$_**0.2) for 1..100' | lsq | sort -n
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*($_**2)) for 1..100' | lsq | sort -n
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*($_**5)) for 1..100' | lsq | sort -n
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*($_**7)) for 1..100' | lsq | sort -n
#
use strict;
use warnings;
use Getopt::Long;
use Math::MatrixReal;
use POSIX;
$|=1;
# simple case: a*x+b=y
#   given many pairs (x,y) find a and b that minimize the error.
#
#  -> a*x0+b=y0
#  -> a*x1+b=y1
#  -> a*x2+b=y2
#
#  -> equation
#   ( x0 1 )  ( a )   ( y0 )
#   ( x1 1 ) *( b ) = ( y1 ) 
#   ( x2 1 )          ( y2 )
#
#      A     * x    =  y
#
#  minimize distance to line:
#    d(line,[xy]) = |A*x-y|
#
#  error :   |A*x-y| = (A*x-y)' * (A*x-y) 
#        = x'*A'*A*x - 2*x*A'*y + y'*y
# differentiate by x
#   ->  2*A'*A*x-2*A'*y = 0  -> A'*A*x=A'*y
#  mini

# 'm' measurements
# 'p' variables
#
# y_0=a0*x_0^0+a1*x_0^1+a2*x_0^2+..an*x_0^n
# y_1=a0*x_1^0+a1*x_1^1+a2*x_1^2+..an*x_1^n
# ..
# y_m=a0*x_m^0+am*x_m^1+a2*x_m^2+..an*x_m^n

# ->
#
# (y_0 .. y_m) = Matrix_rows( [x_0^0, x_0^1, .. x_0^n], [x_1^0 .. x_1^n] ... ) * (a0..an)

#    y=sum(a_i*x^i) find_lsq_poly
#    y=a*exp(b*x)   find_lsq_exponential
#    y=a+b*ln(x)    find_lsq_logarithmic
#    y=a*x^b        find_lsq_powerlaw

# done:   add support for timestamps
#   .. use parsedate to identify timestamps.
#   timestamps can be with, or without date.
# done:   add support for calc yresult for xval
# done:   add support for calc xval for targety
#   .. if list of values starting at col0 follows list, assume that 'y' needs to be calculated
#   .. if list of values starting with TAB follows list, assum that 'x' needs to be calculated.

my ($xcol, $ycol)=(0,1);
my $calclog=1;
my $maxorder=10;
my $minorder=0;
my $outx;
GetOptions(
    "x=s" => \$xcol,
    "y=s" => \$ycol,
    "o=i" => sub { $maxorder= $_[1]+1; $minorder=$_[1]; $calclog=0; },
    "f"   => \$outx,
) or die usage();
sub usage {
    return <<__EOF__
Usage: lsq [-x COL] [-y COL] [-o N]
   -x COL  : specify 'x' column ( first col = 0 )
   -y COL  : specify 'y' column
   -o N    : specify order of polynomial
   -f      : try to move factor inside exponent (a*x)^n
__EOF__
}
my @values;
my @xvalues;
my @yvalues;
my @valueformat;    # -3:date+time, -2:date, -1:time, >=0: decimals
while (<>) {
    next if /^#/;
    my @line= split /[^0-9.e:-]+/, $_;
    next unless @line;
    shift @line if $line[0] eq "";
    for (my $i=0 ; $i<@line ; $i++) {
	    if ($i<$#line && isdate($line[$i]) && istime($line[$i+1])) {
		    $line[$i]= maketimestamp($line[$i], $line[$i+1]);
		    $valueformat[$i]= -3;
		    splice @line, $i+1,1;
	    }
	    elsif (isdate($line[$i])) {
		    $line[$i]= maketimestamp($line[$i], 0);
		    $valueformat[$i]= -2;
	    }
	    elsif (istime($line[$i])) {
		    $line[$i]= maketimestamp(0, $line[$i]);
		    $valueformat[$i]= -1;
	    }
        elsif ($line[$i] =~ /\.(\d+)/ && ( !defined $valueformat[$i] || length($1)>$valueformat[$i])) {
            $valueformat[$i]= length($1);
        }
    }
    if ($line[$xcol] && $line[$ycol]) {
	    if ($line[$xcol] ne "?" && $line[$ycol] ne "?") {
		    push @values, [$line[$xcol], $line[$ycol] ];
	    }
	    elsif ($line[$xcol] ne "?") {
		    push @xvalues, $line[$xcol];
	    }
	    elsif ($line[$ycol] ne "?") {
		    push @yvalues, $line[$ycol];
	    }
    }
    elsif (@line) {
	    if (/^\t/) {
		    push @yvalues, $line[0];
	    }
	    else {
		    push @xvalues, $line[0];
	    }
    }
}
sub isdate {
	return ($_[0] =~ /\d-\d+-\d/);
}
sub istime {
	return ($_[0] =~ /\d:\d/);
}
sub maketimestamp {
	my ($date, $time) = @_;
	my @ymd= $date ? split(/-/, $date) : (0,0,0);
	if ($ymd[0]<100) {
		# y2k fix!
		if ($ymd[0]<50) {
			$ymd[0]+=2000;
		}
		else {
			$ymd[0]+=1900;
		}
	}
	my @hms= $time ? split(/:/, $time) : (0,0,0);
	push @hms,0 if (@hms==2);
	$ymd[0]-=1900;
	$ymd[1]-=1;
	return POSIX::mktime(reverse @ymd,@hms);
}

my @funcs;
# first try polynomials
for (my $order=$minorder ; $order < $maxorder ; $order++) {
    my ($poly, $error)= find_lsq_poly($order, \@values);
    next if (!defined $poly);
    if ($outx) {
        printf("%8f  %s\n", $error,
            join("", map { $_==0 ? sprintf("%5g", $poly->[$_]) :
                $_==1 ? sprintf("%+5g*x", $poly->[$_]) : 
                 sprintf("%s(%5g*x)^%d", $poly->[$_]<0?"-":"+",
                     abs($poly->[$_])**(1/$_), $_) } (0..$#$poly)));
    }
    else {
        printf("%8f  %s\n", $error,
            join("", map { $_==0 ? sprintf("%5g", $poly->[$_]) :
                $_==1 ? sprintf("%+5g*x", $poly->[$_]) : 
                 sprintf("%+5g*x^%d", $poly->[$_], $_) } (0..$#$poly)));
    }

	push @funcs, sub { my $res=0; for (my $i=$#$poly ; $i>=0 ; $i--) { $res *= $_[0]; $res += $poly->[$i]; } return $res; };
}
if ($calclog) {
    {
        my ($ab, $err)= find_lsq_exponential(\@values);
        if ($ab) {
            printf("%8f  %5g * exp( %5g * x )\n", $err, $ab->[0], $ab->[1]);
            push @funcs, sub { return $ab->[0]*exp($_[0]*$ab->[1]); };
        }
    }
    {
        my ($ab, $err)= find_lsq_logarithmic(\@values);
        if ($ab) {
            printf("%8f  %5g + %5g * log( x )\n", $err, $ab->[0], $ab->[1]);
            push @funcs, sub { if ($_[0]<=0) { return undef; } return $ab->[0]+log($_[0])*$ab->[1]; };
        }
    }

    {
        my ($ab, $err)= find_lsq_powerlaw(\@values);
        if ($ab) {
            if ($outx) {
                printf("%8f %s( %5g * x ) ^ %5g\n", $err, 
                    $ab->[0]<0?"-":"+",
                    abs($ab->[0])**(1/$ab->[1]), $ab->[1]);
            }
            else {
                printf("%8f  %5g * x ^ %5g\n", $err, $ab->[0], $ab->[1]);
            }
            push @funcs, sub { return $ab->[0]*$_[0]**$ab->[1]; };
        }
    }
}
for my $xval (@xvalues) {
    printf(" %s", formatvalue($xval, $valueformat[$xcol]));
    for my $f (@funcs) {
        printf(" %s", formatvalue($f->($xval), $valueformat[$ycol]));
    }
    print "\n";
}
for my $yval (@yvalues) {
    printf(" %s", formatvalue($yval, $valueformat[$ycol]));;
    for my $f (@funcs) {
        my $xval= findinverse($f, $yval);
        printf(" %s", formatvalue($xval, $valueformat[$xcol]));;
    }
    print "\n";
}
sub formatvalue {
    my ($val, $ts)= @_;
    $ts ||= 0;
    if (!defined $val) {
        return "undef";
    }
    if ($ts>=0) {
        return sprintf("%.*f", $ts, $val);
    }
    else {
        my @tv= localtime $val;
        $tv[4]+=1;
        $tv[5]+=1900;
        if ($ts==-1) {
            return sprintf("%02d:%02d:%02d", reverse @tv[0..2]);
        }
        elsif ($ts==-2) {
            return sprintf("%04d-%02d-%02d", reverse @tv[3..5]);
        }
        elsif ($ts==-3) {
            return sprintf("%04d-%02d-%02d %02d:%02d:%02d", reverse @tv[0..5]);
        }
    }
}
sub findslope {
    my ($f, $yval, $x)= @_;
    my $x0= $x;
    my $y0= $f->($x0)-$yval;
    my $x0d= $x0+0.0001;
    my $y0d= eval { $f->($x0d)-$yval; };
    if (!defined $y0d || $@) {
        $x0d= $x0-0.0001;
        $y0d= eval { $f->($x0d)-$yval; };
        if (!defined $y0d || $@) {
            return undef;
        }
    }
    return ($y0d-$y0)/($x0d-$x0);
}
sub findinverse {
    my ($f, $yval)= @_;
    # todo: calc epsilon based on min/max x-value.
    my $x0= $values[0][0];

    my $iter=0;
    while (1) {
        my $s= findslope($f, $yval, $x0);
        if (!defined $s || $s==0) {
            return undef;
        }
        my $y0= $f->($x0)-$yval;
        # y0+(x-x0)*s = 0 -> y0+x*s-x0*s -> x=x0-y0/s
        my $x1= $x0-$y0/$s;
        last if abs($x1-$x0)<0.0001;
        last if ($iter++>100);
        $x0= $x1;
    }

    return $x0;
}
sub find_lsq_poly {
    my ($n, $values)= @_;
# (y_0 .. y_m) = Matrix_rows( [x_0^0, x_0^1, .. x_0^n], [x_1^0 .. x_1^n] ... ) * (a0..an)

    if ( $#$values-$n-1 <= 0 ) {
	    return undef;
    }
    my @a;
    my @b;
    for (my $row=0 ; $row<=$#$values ; $row++) {
        push @b, $values->[$row][1];
        for (my $col=0 ; $col<=$n ; $col++) {
            $a[$row][$col]= $values->[$row][0]**$col;
        }
    }
    return solve_lsq(\@a, \@b);
}
#    y=a*exp(b*x) -> solve ln(y) = ln(a) + b*x
#    ( 1  x0 )   ( ln(a) )    ( ln(y0) )
#    ( 1  x1 ) * (   b   )  = ( ln(y1) )
#    ( 1  x2 )                ( ln(y2) )
#
#    todo: how to calculate  err = y-a*exp(b*x)
#       given   |ln(y)-a-b*x|
sub find_lsq_exponential {
    my ($values)= @_;
    my @a;
    my @b;
    for (my $row=0 ; $row<=$#$values ; $row++) {
        push @b, log($values->[$row][1]);
        $a[$row][0]= 1;
        $a[$row][1]= $values->[$row][0];
    }
    my ($ab, $err)= solve_lsq(\@a, \@b);
    return undef if (!defined $ab);
#   my $err2=0;
#   $err2 += ($_->[1]-exp($ab->[0])*exp($ab->[1]*$_->[0]))**2 for @$values;
#   printf("err1 : %f   err2 : %f\n", $err, sqrt($err2/($#$values-3)));
    return [ exp($ab->[0]), $ab->[1] ], $err;
}
#    y=a+b*ln(x)
sub find_lsq_logarithmic {
    my ($values)= @_;
    my @a;
    my @b;
    for (my $row=0 ; $row<=$#$values ; $row++) {
        push @b, $values->[$row][1];
        $a[$row][0]= 1;
        $a[$row][1]= log($values->[$row][0]);
    }
    return solve_lsq(\@a, \@b);
}
#    y=a*x^b  ->  ln(y) = ln(a) + b*ln(x)
sub find_lsq_powerlaw {
    my ($values)= @_;
    my @a;
    my @b;
    for (my $row=0 ; $row<=$#$values ; $row++) {
        push @b, log($values->[$row][1]);
        $a[$row][0]= 1;
        $a[$row][1]= log($values->[$row][0]);
    }
    my ($ab, $err)= solve_lsq(\@a, \@b);
    return undef if (!defined $ab);
    return [ exp($ab->[0]), $ab->[1] ], $err;
}

sub solve_lsq {
    my ($m_a, $v_b)= @_;
    return undef if (@{$m_a}-@{$m_a->[0]}-1<=0);
    my $A= Math::MatrixReal->new_from_rows($m_a);
    my $b= Math::MatrixReal->new_from_columns([$v_b]);
    my $Ab= (~$A) * $b;
    my $AA = (~$A) * $A;

    my $LR= $AA->decompose_LR();
    my ($dim,$x,$B) = $LR->solve_LR($Ab);

    if (!defined $x) {
	    printf("NO SOLUTION!\n");
	    return undef;
    }
    my $Axb= $A*$x-$b;
    my $error= (~$Axb)*$Axb;

    #printf("a : %d x %d -  b : %d  err:%f\n", scalar @$m_a, scalar @{$m_a->[0]}, scalar @$v_b, sqrt($error->element(1,1)/(@{$m_a}-@{$m_a->[0]}-1)));
    return [ map { $x->element($_,1) } (1..@{$m_a->[0]}) ], sqrt($error->element(1,1)/(@{$m_a}-@{$m_a->[0]}-1));
}
