#!perl -w # vim: sw=4 ts=4 et : # (C) 2003-2007 Willem Jan Hengeveld # Web: http://www.xs4all.nl/~itsme/ # http://wiki.xda-developers.com/ # # $Id: $ # # http://en.wikipedia.org/wiki/Linear_regression # http://mathworld.wolfram.com/LeastSquaresFitting.html # http://mathworld.wolfram.com/LeastSquaresFittingPolynomial.html # http://mathworld.wolfram.com/LeastSquaresFittingExponential.html # http://mathworld.wolfram.com/LeastSquaresFittingPerpendicularOffsets.html # http://mathworld.wolfram.com/NonlinearLeastSquaresFitting.html # # tests: # perl -e 'printf("%5.2f %5.2f\n", $_, 0.4*exp(0.2*$_)) for 1..100' | lsq | sort -n # perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*log($_)) for 1..100' | lsq | sort -n # perl -e 'printf("%5.2f %5.2f\n", $_, 0.4*$_**0.2) for 1..100' | lsq | sort -n # perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*($_**2)) for 1..100' | lsq | sort -n # perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*($_**5)) for 1..100' | lsq | sort -n # perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*($_**7)) for 1..100' | lsq | sort -n # use strict; use warnings; use Getopt::Long; use Math::MatrixReal; use POSIX; $|=1; # simple case: a*x+b=y # given many pairs (x,y) find a and b that minimize the error. # # -> a*x0+b=y0 # -> a*x1+b=y1 # -> a*x2+b=y2 # # -> equation # ( x0 1 ) ( a ) ( y0 ) # ( x1 1 ) *( b ) = ( y1 ) # ( x2 1 ) ( y2 ) # # A * x = y # # minimize distance to line: # d(line,[xy]) = |A*x-y| # # error : |A*x-y| = (A*x-y)' * (A*x-y) # = x'*A'*A*x - 2*x*A'*y + y'*y # differentiate by x # -> 2*A'*A*x-2*A'*y = 0 -> A'*A*x=A'*y # mini # 'm' measurements # 'p' variables # # y_0=a0*x_0^0+a1*x_0^1+a2*x_0^2+..an*x_0^n # y_1=a0*x_1^0+a1*x_1^1+a2*x_1^2+..an*x_1^n # .. # y_m=a0*x_m^0+am*x_m^1+a2*x_m^2+..an*x_m^n # -> # # (y_0 .. y_m) = Matrix_rows( [x_0^0, x_0^1, .. x_0^n], [x_1^0 .. x_1^n] ... ) * (a0..an) # y=sum(a_i*x^i) find_lsq_poly # y=a*exp(b*x) find_lsq_exponential # y=a+b*ln(x) find_lsq_logarithmic # y=a*x^b find_lsq_powerlaw # todo: add support for timestamps # .. use parsedate to identify timestamps. # timestamps can be with, or without date. # todo: add support for calc yresult for xval # todo: add support for calc xval for targety # .. if list of values starting at col0 follows list, assume that 'y' needs to be calculated # .. if list of values starting with TAB follows list, assum that 'x' needs to be calculated. my ($xcol, $ycol)=(0,1); my $calclog=1; my $maxorder=10; my $minorder=0; GetOptions( "x=s" => \$xcol, "y=s" => \$ycol, "o=i" => sub { $maxorder= $_[1]+1; $minorder=$_[1]; $calclog=0; }, ); my @values; my @xvalues; my @yvalues; my @valueformat; # -3:date+time, -2:date, -1:time, >=0: decimals while (<>) { my @line= split /[^0-9.e:-]+/, $_; next unless @line; shift @line if $line[0] eq ""; for (my $i=0 ; $i<@line ; $i++) { if ($i<$#line && isdate($line[$i]) && istime($line[$i+1])) { $line[$i]= maketimestamp($line[$i], $line[$i+1]); $valueformat[$i]= -3; splice @line, $i+1,1; } elsif (isdate($line[$i])) { $line[$i]= maketimestamp($line[$i], 0); $valueformat[$i]= -2; } elsif (istime($line[$i])) { $line[$i]= maketimestamp(0, $line[$i]); $valueformat[$i]= -1; } elsif ($line[$i] =~ /\.(\d+)/ && ( !defined $valueformat[$i] || length($1)>$valueformat[$i])) { $valueformat[$i]= length($1); } } if ($line[$xcol] && $line[$ycol]) { if ($line[$xcol] ne "?" && $line[$ycol] ne "?") { push @values, [$line[$xcol], $line[$ycol] ]; } elsif ($line[$xcol] ne "?") { push @xvalues, $line[$xcol]; } elsif ($line[$ycol] ne "?") { push @yvalues, $line[$ycol]; } } elsif (@line) { if (/^\t/) { push @yvalues, $line[0]; } else { push @xvalues, $line[0]; } } } sub isdate { return ($_[0] =~ /\d-\d+-\d/); } sub istime { return ($_[0] =~ /\d:\d/); } sub maketimestamp { my ($date, $time) = @_; my @ymd= $date ? split(/-/, $date) : (0,0,0); if ($ymd[0]<100) { # y2k fix! if ($ymd[0]<50) { $ymd[0]+=2000; } else { $ymd[0]+=1900; } } my @hms= $time ? split(/:/, $time) : (0,0,0); push @hms,0 if (@hms==2); $ymd[0]-=1900; $ymd[1]-=1; return POSIX::mktime(reverse @ymd,@hms); } my @funcs; # first try polynomials for (my $order=$minorder ; $order < $maxorder ; $order++) { my ($poly, $error)= find_lsq_poly($order, \@values); next if (!defined $poly); printf("%8f %s\n", $error, join("", map { $_==0 ? sprintf("%5g", $poly->[$_]) : $_==1 ? sprintf("%+5g*x", $poly->[$_]) : sprintf("%+5g*x^%d", $poly->[$_], $_) } (0..$#$poly))); push @funcs, sub { my $res=0; for (my $i=$#$poly ; $i>=0 ; $i--) { $res *= $_[0]; $res += $poly->[$i]; } return $res; }; } if ($calclog) { { my ($ab, $err)= find_lsq_exponential(\@values); if ($ab) { printf("%8f %5g * exp( %5g * x )\n", $err, $ab->[0], $ab->[1]); push @funcs, sub { return $ab->[0]*exp($_[0]*$ab->[1]); }; } } { my ($ab, $err)= find_lsq_logarithmic(\@values); if ($ab) { printf("%8f %5g + %5g * log( x )\n", $err, $ab->[0], $ab->[1]); push @funcs, sub { if ($_[0]<=0) { return undef; } return $ab->[0]+log($_[0])*$ab->[1]; }; } } { my ($ab, $err)= find_lsq_powerlaw(\@values); if ($ab) { printf("%8f %5g * x ^ %5g\n", $err, $ab->[0], $ab->[1]); push @funcs, sub { return $ab->[0]*$_[0]**$ab->[1]; }; } } } for my $xval (@xvalues) { printf(" %s", formatvalue($xval, $valueformat[$xcol])); for my $f (@funcs) { printf(" %s", formatvalue($f->($xval), $valueformat[$ycol])); } print "\n"; } for my $yval (@yvalues) { printf(" %s", formatvalue($yval, $valueformat[$ycol]));; for my $f (@funcs) { my $xval= findinverse($f, $yval); printf(" %s", formatvalue($xval, $valueformat[$xcol]));; } print "\n"; } sub formatvalue { my ($val, $ts)= @_; $ts ||= 0; if (!defined $val) { return "undef"; } if ($ts>=0) { return sprintf("%.*f", $ts, $val); } else { my @tv= localtime $val; $tv[4]+=1; $tv[5]+=1900; if ($ts==-1) { return sprintf("%02d:%02d:%02d", reverse @tv[0..2]); } elsif ($ts==-2) { return sprintf("%04d-%02d-%02d", reverse @tv[3..5]); } elsif ($ts==-3) { return sprintf("%04d-%02d-%02d %02d:%02d:%02d", reverse @tv[0..5]); } } } sub findslope { my ($f, $yval, $x)= @_; my $x0= $x; my $y0= $f->($x0)-$yval; my $x0d= $x0+0.0001; my $y0d= eval { $f->($x0d)-$yval; }; if (!defined $y0d || $@) { $x0d= $x0-0.0001; $y0d= eval { $f->($x0d)-$yval; }; if (!defined $y0d || $@) { return undef; } } return ($y0d-$y0)/($x0d-$x0); } sub findinverse { my ($f, $yval)= @_; # todo: calc epsilon based on min/max x-value. my $x0= $values[0][0]; my $iter=0; while (1) { my $s= findslope($f, $yval, $x0); if (!defined $s || $s==0) { return undef; } my $y0= $f->($x0)-$yval; # y0+(x-x0)*s = 0 -> y0+x*s-x0*s -> x=x0-y0/s my $x1= $x0-$y0/$s; last if abs($x1-$x0)<0.0001; last if ($iter++>100); $x0= $x1; } return $x0; } sub find_lsq_poly { my ($n, $values)= @_; # (y_0 .. y_m) = Matrix_rows( [x_0^0, x_0^1, .. x_0^n], [x_1^0 .. x_1^n] ... ) * (a0..an) if ( $#$values-$n-1 <= 0 ) { return undef; } my @a; my @b; for (my $row=0 ; $row<=$#$values ; $row++) { push @b, $values->[$row][1]; for (my $col=0 ; $col<=$n ; $col++) { $a[$row][$col]= $values->[$row][0]**$col; } } return solve_lsq(\@a, \@b); } # y=a*exp(b*x) -> solve ln(y) = ln(a) + b*x # ( 1 x0 ) ( ln(a) ) ( ln(y0) ) # ( 1 x1 ) * ( b ) = ( ln(y1) ) # ( 1 x2 ) ( ln(y2) ) # # todo: how to calculate err = y-a*exp(b*x) # given |ln(y)-a-b*x| sub find_lsq_exponential { my ($values)= @_; my @a; my @b; for (my $row=0 ; $row<=$#$values ; $row++) { push @b, log($values->[$row][1]); $a[$row][0]= 1; $a[$row][1]= $values->[$row][0]; } my ($ab, $err)= solve_lsq(\@a, \@b); return undef if (!defined $ab); # my $err2=0; # $err2 += ($_->[1]-exp($ab->[0])*exp($ab->[1]*$_->[0]))**2 for @$values; # printf("err1 : %f err2 : %f\n", $err, sqrt($err2/($#$values-3))); return [ exp($ab->[0]), $ab->[1] ], $err; } # y=a+b*ln(x) sub find_lsq_logarithmic { my ($values)= @_; my @a; my @b; for (my $row=0 ; $row<=$#$values ; $row++) { push @b, $values->[$row][1]; $a[$row][0]= 1; $a[$row][1]= log($values->[$row][0]); } return solve_lsq(\@a, \@b); } # y=a*x^b -> ln(y) = ln(a) + b*ln(x) sub find_lsq_powerlaw { my ($values)= @_; my @a; my @b; for (my $row=0 ; $row<=$#$values ; $row++) { push @b, log($values->[$row][1]); $a[$row][0]= 1; $a[$row][1]= log($values->[$row][0]); } my ($ab, $err)= solve_lsq(\@a, \@b); return undef if (!defined $ab); return [ exp($ab->[0]), $ab->[1] ], $err; } sub solve_lsq { my ($m_a, $v_b)= @_; return undef if (@{$m_a}-@{$m_a->[0]}-1<=0); my $A= Math::MatrixReal->new_from_rows($m_a); my $b= Math::MatrixReal->new_from_columns([$v_b]); my $Ab= (~$A) * $b; my $AA = (~$A) * $A; my $LR= $AA->decompose_LR(); my ($dim,$x,$B) = $LR->solve_LR($Ab); if (!defined $x) { printf("NO SOLUTION!\n"); return undef; } my $Axb= $A*$x-$b; my $error= (~$Axb)*$Axb; #printf("a : %d x %d - b : %d err:%f\n", scalar @$m_a, scalar @{$m_a->[0]}, scalar @$v_b, sqrt($error->element(1,1)/(@{$m_a}-@{$m_a->[0]}-1))); return [ map { $x->element($_,1) } (1..@{$m_a->[0]}) ], sqrt($error->element(1,1)/(@{$m_a}-@{$m_a->[0]}-1)); }