diff options
author | Lorry Tar Creator <lorry-tar-importer@baserock.org> | 2012-09-24 10:15:50 +0000 |
---|---|---|
committer | Lorry <lorry@roadtrain.codethink.co.uk> | 2012-09-26 13:46:46 +0000 |
commit | 485b97be9f2f2abf5a40923b5fd85f75714a8c02 (patch) | |
tree | ca05cb0ecf3828d909a898c3e5805804a0aff5f8 /lib/DBD/SQLite | |
download | perl-dbd-sqlite-tarball-master.tar.gz |
Imported from /srv/lorry/lorry-area/perl-dbd-sqlite-tarball/DBD-SQLite-1.38_01.tar.gz.HEADDBD-SQLite-1.38_01masterbaserock/morph
Diffstat (limited to 'lib/DBD/SQLite')
-rw-r--r-- | lib/DBD/SQLite/Cookbook.pod | 194 |
1 files changed, 194 insertions, 0 deletions
diff --git a/lib/DBD/SQLite/Cookbook.pod b/lib/DBD/SQLite/Cookbook.pod new file mode 100644 index 0000000..8b40c5b --- /dev/null +++ b/lib/DBD/SQLite/Cookbook.pod @@ -0,0 +1,194 @@ +=head1 NAME + +DBD::SQLite::Cookbook - The DBD::SQLite Cookbook + +=head1 DESCRIPTION + +This is the L<DBD::SQLite> cookbook. + +It is intended to provide a place to keep a variety of functions and +formulas for use in callback APIs in L<DBD::SQLite>. + +=head1 AGGREGATE FUNCTIONS + +=head2 Variance + +This is a simple aggregate function which returns a variance. It is +adapted from an example implementation in pysqlite. + + package variance; + + sub new { bless [], shift; } + + sub step { + my ( $self, $value ) = @_; + + push @$self, $value; + } + + sub finalize { + my $self = $_[0]; + + my $n = @$self; + + # Variance is NULL unless there is more than one row + return undef unless $n > 1; + + my $mu = 0; + foreach my $v ( @$self ) { + $mu += $v; + } + $mu /= $n; + + my $sigma = 0; + foreach my $v ( @$self ) { + $sigma += ($v - $mu)**2; + } + $sigma = $sigma / ($n - 1); + + return $sigma; + } + + # NOTE: If you use an older DBI (< 1.608), + # use $dbh->func(..., "create_aggregate") instead. 
+ $dbh->sqlite_create_aggregate( "variance", 1, 'variance' ); + +The function can then be used as: + + SELECT group_name, variance(score) + FROM results + GROUP BY group_name; + +=head2 Variance (Memory Efficient) + +A more efficient variance function, optimized for memory usage at the +expense of precision: + + package variance2; + + sub new { bless {sum => 0, count=>0, hash=> {} }, shift; } + + sub step { + my ( $self, $value ) = @_; + my $hash = $self->{hash}; + + # by truncating and hashing, we can consume many more data points + $value = int($value); # change depending on need for precision + # use sprintf for arbitrary fp precision + if (exists $hash->{$value}) { + $hash->{$value}++; + } else { + $hash->{$value} = 1; + } + $self->{sum} += $value; + $self->{count}++; + } + + sub finalize { + my $self = $_[0]; + + # Variance is NULL unless there is more than one row + return undef unless $self->{count} > 1; + + # calculate avg + my $mu = $self->{sum} / $self->{count}; + + my $sigma = 0; + while (my ($h, $v) = each %{$self->{hash}}) { + $sigma += (($h - $mu)**2) * $v; + } + $sigma = $sigma / ($self->{count} - 1); + + return $sigma; + } + +The function can then be used as: + + SELECT group_name, variance2(score) + FROM results + GROUP BY group_name; + +=head2 Variance (Highly Scalable) + +A third variance implementation, designed for arbitrarily large data sets: + + package variance3; + + sub new { bless {mu=>0, count=>0, S=>0}, shift; } + + sub step { + my ( $self, $value ) = @_; + $self->{count}++; + my $delta = $value - $self->{mu}; + $self->{mu} += $delta/$self->{count}; + $self->{S} += $delta*($value - $self->{mu}); + } + + sub finalize { + my $self = $_[0]; + # Variance is NULL unless there is more than one row + return undef unless $self->{count} > 1; + return $self->{S} / ($self->{count} - 1); + } + +The function can then be used as: + + SELECT group_name, variance3(score) + FROM results + GROUP BY group_name; + +=head1 FTS3 fulltext indexing + +=head2 Sparing database disk space + +As explained in L<http://www.sqlite.org/fts3.html#section_6>, each 
+FTS3 table C<I<t>> is stored internally within three regular tables +C<I<t>_content>, C<I<t>_segments> and C<I<t>_segdir>. The last two +tables contain the fulltext index. The first table C<I<t>_content> +stores the complete documents being indexed ... but if copies of the +same documents are already stored somewhere else, or can be computed +from external resources (for example as HTML or MsWord files in the +filesystem), then this is quite a waste of space. SQLite itself only +needs the C<I<t>_content> table for implementing the C<offsets()> and +C<snippet()> functions, which are not always usable anyway (in particular +when using utf8 characters greater than 255). + +So an alternative strategy is to use SQLite only for the fulltext +index and metadata, and to keep the full documents outside of SQLite: +to do so, after each insert or update in the FTS3 table, do an update +in the C<I<t>_content> table, setting the content column(s) to +NULL. Of course your application will need an algorithm for finding +the external resource corresponding to any I<docid> stored within +SQLite. Furthermore, SQLite C<offsets()> and C<snippet()> functions +cannot be used, so if such functionality is needed, it has to be +directly programmed within the Perl application. +In short, this strategy is really a hack, because FTS3 was not originally +programmed with that behaviour in mind; however it is workable +and has a strong impact on the size of the database file. + +=head1 SUPPORT + +Bugs should be reported via the CPAN bug tracker at + +L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DBD-SQLite> + +=head1 TO DO + +* Add more and varied cookbook recipes, until we have enough to +turn them into a separate CPAN distribution. + +* Create a series of test scripts that validate the cookbook recipes. + +=head1 AUTHOR + +Adam Kennedy E<lt>adamk@cpan.orgE<gt> + +Laurent Dami E<lt>dami@cpan.orgE<gt> + +=head1 COPYRIGHT + +Copyright 2009 - 2012 Adam Kennedy. 
+ +This program is free software; you can redistribute +it and/or modify it under the same terms as Perl itself. + +The full text of the license can be found in the +LICENSE file included with this module. |