daily pastebin goal
17%
SHARE
TWEET

load_erecords.pl

Dyrcona Nov 8th, 2018 145 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/perl
  2. # ---------------------------------------------------------------
  3. # Copyright © 2016 C/W MARS, Inc.
  4. # Jason Stephenson <jstephenson@cwmars.org>
  5. #
  6. # This program is free software; you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation; either version 2 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # This program is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14. # GNU General Public License for more details.
  15. # ---------------------------------------------------------------
  16.  
  17. use strict;
  18. use warnings;
  19. use feature qw/state/;
  20. use Getopt::Long;
  21. use MARC::Record;
  22. use MARC::File::XML (BinaryEncoding => 'utf8');
  23. use OpenILS::Utils::Normalize qw(clean_marc naco_normalize);
  24. use IO::File;
  25. use DateTime;
  26. use DateTime::TimeZone;
  27. use Time::HiRes qw/tv_interval gettimeofday/;
  28. use DBI;
  29.  
  30. IO::File->input_record_separator("\x1E\x1D");
  31. IO::File->output_record_separator("\n");
  32.  
  33. # options with defaults:
  34. my $db_user = $ENV{PGUSER} || 'evergreen';
  35. my $db_host = $ENV{PGHOST} || 'evergreen';
  36. my $db_db = $ENV{PGDATABASE} || 'evergreen';
  37. my $db_password = $ENV{PGPASSWORD} || 'evergreen';
  38. my $db_port = $ENV{PGPORT} || 5432;
  39. my $source;
  40. my $strict;
  41. my $timing;
  42.  
  43. GetOptions("user=s" => \$db_user,
  44.            "host=s" => \$db_host,
  45.            "db=s" => \$db_db,
  46.            "password=s" => \$db_password,
  47.            "port=i" => \$db_port,
  48.            "source=s" => \$source,
  49.            timing => \$timing,
  50.            strict => \$strict) or die("Error in command line options");
  51.  
  52. my $dbh = DBI->connect("dbi:Pg:database=$db_db;host=$db_host;port=$db_port;application_name=loaderecords",
  53.                        $db_user, $db_password,
  54.                        {PrintError => 0, RaiseError => 1, AutoCommit => 1})
  55.     or die("No database connection.");
  56.  
  57. die("Must specify --source option.") unless ($source);
  58.  
  59. $source = lookup_source($source);
  60.  
  61. die("--source is not valid.") unless ($source);
  62.  
  63. my $mapper = MARCFixedFieldMapper->new();
  64.  
  65. my ($rej, $exc); # Variables for reject and exception file handles. We only open this if necessary.
  66. my $error_count = 0; # Count of errors.
  67.  
  68. # Because this can produce lots of output, we're writing progress messages to a log file instead of standard output.
  69. my $log = IO::File->new("> log.txt");
  70.  
  71. foreach my $input_file (@ARGV) {
  72.     my $count = 0;
  73.     my $fh = IO::File->new("< $input_file");
  74.     my $str = date_str($input_file, 1);
  75.     print("$str"); # For running from at, etc., so we have something in the email to let us know when it is done.
  76.     $log->print($str);
  77.     while (my $raw = <$fh>) {
  78.         $count++;
  79.         eval {
  80.             my ($match_start, $match_end, $update_start, $update_end, $message);
  81.             my $record = MARC::Record->new_from_usmarc($raw);
  82.             my @warnings = $record->warnings();
  83.             $match_start = [gettimeofday()];
  84.             my $match = find_best_match($record);
  85.             $match_end = [gettimeofday()];
  86.             if ($match) {
  87.                 no warnings qw(uninitialized);
  88.                 my $update_needed = 0;
  89.                 $message = "$input_file $count matches " . $match->{id} . " with score " . $match->{score};
  90.                 $message .= " in " . tv_interval($match_start, $match_end) . " seconds" if ($timing);
  91.                 $log->print($message);
  92.                 foreach my $nfield ($record->field('856')) {
  93.                     my $add = 1;
  94.                     foreach my $ofield ($match->{marc}->field('856')) {
  95.                         if ($nfield->subfield('9') eq $ofield->subfield('9') && $nfield->subfield('u')
  96.                                 eq $ofield->subfield('u')) {
  97.                             $add = 0;
  98.                             last;
  99.                         }
  100.                     }
  101.                     if ($add) {
  102.                         $match->{marc}->insert_fields_ordered($nfield);
  103.                         $update_needed++;
  104.                     }
  105.                 }
  106.                 if ($update_needed) {
  107.                     $update_start = [gettimeofday()];
  108.                     my $success = update_marc($match);
  109.                     $update_end = [gettimeofday()];
  110.                     if ($success == 0) { # man DBI and look for the execute statement handle description for why.
  111.                         $message = "$input_file $count update of record " . $match->{id} . " failed";
  112.                     } else {
  113.                         $message = "$input_file $count added $update_needed URL(s) to record " . $match->{id};
  114.                     }
  115.                     $message .= " in " . tv_interval($update_start, $update_end) . " seconds" if ($timing);
  116.                     $log->print($message);
  117.                 } else {
  118.                     $log->print("$input_file $count URL tag exists in " . $match->{id});
  119.                 }
  120.             } else {
  121.                 if ($timing) {
  122.                     $log->print("$input_file $count did not match in " . tv_interval($match_start, $match_end) . " seconds");
  123.                 }
  124.                 if (@warnings) {
  125.                     if ($strict) {
  126.                         die("@warnings");
  127.                     } else {
  128.                         $log->print("$input_file $count @warnings");
  129.                     }
  130.                 }
  131.                 $update_start = [gettimeofday()];
  132.                 my $id = insert_marc($source, $record);
  133.                 $update_end = [gettimeofday()];
  134.                 if ($id) {
  135.                     $message = "$input_file $count inserted as bre.id $id";
  136.                 } else {
  137.                     $message = "$input_file $count failed to insert";
  138.                 }
  139.                 $message .= " in " . tv_interval($update_start, $update_end) . " seconds" if ($timing);
  140.                 $log->print($message);
  141.             }
  142.         };
  143.         if ($@) {
  144.             my $error = $@;
  145.             $error =~ s/\s+$//;
  146.             $error_count++;
  147.             unless ($rej) {
  148.                 $rej = IO::File->new("> skipped_bibs.mrc");
  149.                 $rej->binmode(':raw');
  150.             }
  151.             unless ($exc) {
  152.                 $exc = IO::File->new("> exceptions.txt");
  153.             }
  154.             { local $\; # Just makin' sure.
  155.               $rej->print($raw); }
  156.             { local $\ = "\cM\cJ";
  157.               $exc->print("Record $error_count: $error"); }
  158.             $log->print("$input_file $count $error");
  159.         }
  160.     }
  161.     $fh->close();
  162.     $str = date_str($input_file, 0);
  163.     print("$str"); # For running from at, etc., so we have something in the email to let us know when it is done.
  164.     $log->print($str);
  165. }
  166.  
  167. END {
  168.     $dbh->disconnect() if ($dbh);
  169.     if ($log && $log->opened()) {
  170.         $log->close();
  171.     }
  172.     if ($rej && $rej->opened()) {
  173.         $rej->close();
  174.     }
  175.     if ($exc && $exc->opened()) {
  176.         $exc->close();
  177.     }
  178. }
  179.  
  180. sub find_best_match {
  181.     my $record = shift;
  182.  
  183.     my $id_matches = get_identifier_matches($record);
  184.     my $isbn_matches = get_isbn_matches($record);
  185.  
  186.     if ($id_matches || $isbn_matches) {
  187.         my %merged;
  188.         if ($id_matches && $isbn_matches) {
  189.             %merged = %$id_matches;
  190.             foreach my $k (keys %$isbn_matches) {
  191.                 if ($merged{$k}) {
  192.                     $merged{$k}->{score} += $isbn_matches->{$k}->{score};
  193.                 } else {
  194.                     $merged{$k} = $isbn_matches->{$k};
  195.                 }
  196.             }
  197.         } elsif ($id_matches) {
  198.             %merged = %$id_matches;
  199.         } else {
  200.             %merged = %$isbn_matches;
  201.         }
  202.  
  203.         my @results = sort {$b->{score} <=> $a->{score}} sort {$b->{id} <=> $a->{id}} values %merged;
  204.         my $data = $results[0];
  205.         $data->{marc} = MARC::Record->new_from_xml($data->{marc}) if ($data && ref($data) eq 'HASH' && $data->{marc});
  206.         return $data;
  207.     }
  208.  
  209.     return undef;
  210. }
  211.  
  212. sub get_identifier_matches {
  213.     my $record = shift;
  214.  
  215.     state $sth = $dbh->prepare(<<'EOQ'
  216. select bre.id, bre.marc, 2 as score
  217. from biblio.record_entry bre
  218. join metabib.record_attr_vector_list mravl on mravl.source = bre.id
  219. join config.coded_value_map itype on idx(mravl.vlist, itype.id) > 0
  220. and itype.ctype = 'item_type' and itype.code = $1
  221. join config.coded_value_map iform on idx(mravl.vlist, iform.id) > 0
  222. and iform.ctype = 'item_form' and iform.code = $2
  223. join metabib.real_full_rec identifier on identifier.record = bre.id
  224. and identifier.tag = '035'
  225. and identifier.subfield = 'a'
  226. and identifier.value = any($3)
  227. where not bre.deleted
  228. EOQ
  229.     );
  230.  
  231.     $sth->bind_param(1, $mapper->type($record));
  232.     $sth->bind_param(2, $mapper->form($record));
  233.     $sth->bind_param(3, prepare_identifiers($record));
  234.     if ($sth->execute()) {
  235.         my $data = $sth->fetchall_hashref('id');
  236.         if ($data && %$data) {
  237.             return $data;
  238.         }
  239.     }
  240.  
  241.     return undef;
  242. }
  243.  
  244. sub get_isbn_matches {
  245.     my $record = shift;
  246.  
  247.     my $isbn_query = prepare_isbns($record);
  248.  
  249.     state $sth = $dbh->prepare(<<'EOQ'
  250. select bre.id, bre.marc, 1 as score
  251. from biblio.record_entry bre
  252. join metabib.record_attr_vector_list mravl on mravl.source = bre.id
  253. join config.coded_value_map itype on idx(mravl.vlist, itype.id) > 0
  254. and itype.ctype = 'item_type' and itype.code = $1
  255. join config.coded_value_map iform on idx(mravl.vlist, iform.id) > 0
  256. and iform.ctype = 'item_form' and iform.code = $2
  257. join metabib.real_full_rec isbn on isbn.record = bre.id
  258. and isbn.tag = '020'
  259. and isbn.subfield = 'a'
  260. and index_vector @@ $3
  261. where not bre.deleted
  262. EOQ
  263.     );
  264.  
  265.     if ($isbn_query) {
  266.         $sth->bind_param(1, $mapper->type($record));
  267.         $sth->bind_param(2, $mapper->form($record));
  268.         $sth->bind_param(3, $isbn_query);
  269.         if ($sth->execute()) {
  270.             my $data = $sth->fetchall_hashref('id');
  271.             if ($data && %$data) {
  272.                 return $data;
  273.             }
  274.         }
  275.     }
  276.  
  277.     return undef;
  278. }
  279.  
  280. sub prepare_identifiers {
  281.     my $record = shift;
  282.     my $out = [];
  283.  
  284.     my @fields = $record->field('035');
  285.     foreach my $field (@fields) {
  286.         my $str = $field->subfield('a');
  287.         push(@$out, naco_normalize($str, 'a')) if ($str);
  288.     }
  289.     return $out;
  290. }
  291.  
  292. sub prepare_isbns {
  293.     my $record = shift;
  294.     my @isbns = ();
  295.     my @fields = $record->field('020');
  296.     foreach my $field (@fields) {
  297.         my $isbn = $field->subfield('a');
  298.         next unless($isbn);
  299.         $isbn = naco_normalize($isbn, 'a');
  300.         my $idx = index($isbn, ' ');
  301.         $isbn = substr($isbn, 0, $idx) if ($idx != -1);
  302.         push(@isbns, $isbn) unless (grep {$_ eq $isbn} @isbns);
  303.     }
  304.     return join(' | ', @isbns);
  305. }
  306.  
  307. sub lookup_source {
  308.     my $source = shift;
  309.     if ($source =~ /^\d+$/) {
  310.         # check that this is a valid source id.
  311.         my $data = $dbh->selectall_arrayref("select source from config.bib_source where id = $source");
  312.         if ($data && @$data) {
  313.             return $source;
  314.         }
  315.     } else {
  316.         my $data = $dbh->selectall_arrayref('select id from config.bib_source where source ~* ?', {}, "^$source");
  317.         if ($data && @$data) {
  318.             return $data->[0]->[0];
  319.         }
  320.     }
  321.     return undef;
  322. }
  323.  
  324. sub update_marc {
  325.     my $ref = shift;
  326.     state $sth = $dbh->prepare('update biblio.record_entry set marc = $2 where id = $1');
  327.     $sth->bind_param(1, $ref->{id});
  328.     $sth->bind_param(2, clean_marc($ref->{marc}));
  329.     return $sth->execute();
  330. }
  331.  
  332. sub insert_marc {
  333.     my ($source, $record) = @_;
  334.     state $sth = $dbh->prepare(<<EOINSERT
  335. insert into biblio.record_entry
  336. (source, marc, last_xact_id)
  337. values
  338. (?, ?, pg_backend_pid() || '.' || extract(epoch from now()))
  339. returning id
  340. EOINSERT
  341.     );
  342.     $sth->bind_param(1, $source);
  343.     $sth->bind_param(2, clean_marc($record));
  344.     if ($sth->execute()) {
  345.         my $data = $sth->fetchall_arrayref();
  346.         if ($data && @$data) {
  347.             return $data->[0]->[0];
  348.         }
  349.     }
  350.     return undef;
  351. }
  352.  
  353. sub date_str {
  354.     my ($file, $open) = @_;
  355.     my $dt = DateTime->now(time_zone => DateTime::TimeZone->new(name => 'local'));
  356.     return (($open) ? 'Starting' : 'Closing') . " $file at " . $dt->strftime('%a, %d %b %Y %H:%M:%S %z.');
  357. }
  358.  
  359. package MARCFixedFieldMapper;
  360.  
  361. use vars qw/$AUTOLOAD/;
  362.  
  363. sub new {
  364.     my $proto = shift;
  365.     my $class = ref $proto || $proto;
  366.     my $self = {};
  367.     my $instance = bless($self, $class);
  368.     $instance->_init_rec_type_map();
  369.     $instance->_init_fixed_field_map();
  370.     return $instance;
  371. }
  372.  
  373. sub _init_rec_type_map {
  374.     my $self = shift;
  375.     eval {
  376.         $self->{marc21_rec_type_map} = $dbh->selectall_hashref('select * from config.marc21_rec_type_map', 'code');
  377.     };
  378.     if ($@) {
  379.         die("Failed to initialize MARCFixedFieldMapper: $@");
  380.     }
  381. }
  382.  
  383. sub _init_fixed_field_map {
  384.     my $self = shift;
  385.     eval {
  386.         $self->{marc21_ff_pos_map} = $dbh->selectall_hashref('select * from config.marc21_ff_pos_map',
  387.                                                              ['fixed_field', 'rec_type', 'tag']);
  388.     };
  389.     if ($@) {
  390.         die("Failed to initialize MARCFixedFieldMapper: $@");
  391.     }
  392.     $self->{field_map} = {};
  393.     foreach my $ff (keys %{$self->{marc21_ff_pos_map}}) {
  394.         my $f = lc($ff);
  395.         $f =~ s|/||;
  396.         $self->{field_map}->{$f} = $ff;
  397.     }
  398. }
  399.  
  400. sub item_type {
  401.     my $self = shift;
  402.     my $record = shift;
  403.     my $ldr = $record->leader();
  404.     return substr($ldr, 6, 1);
  405. }
  406.  
  407. sub bib_level {
  408.     my $self = shift;
  409.     my $record = shift;
  410.     my $ldr = $record->leader();
  411.     return substr($ldr, 7, 1);
  412. }
  413.  
  414. sub rec_type {
  415.     my $self = shift;
  416.     my $record = shift;
  417.  
  418.     my $href = $self->{marc21_rec_type_map};
  419.     my $itype = $self->item_type($record);
  420.     my $blvl = $self->bib_level($record);
  421.     my ($rec_type) = grep {$href->{$_}->{type_val} =~ $itype && $href->{$_}->{blvl_val} =~ $blvl} keys %$href;
  422.     return $rec_type;
  423. }
  424.  
  425. sub AUTOLOAD {
  426.     my $self = shift;
  427.     my $record = shift;
  428.  
  429.     my $field = $AUTOLOAD;
  430.     $field =~ s/.*:://;
  431.     if ($self->{field_map}->{$field}) {
  432.         my $ffield = $self->{field_map}->{$field};
  433.         my $rec_type = $self->rec_type($record);
  434.         my $map = $self->{marc21_ff_pos_map}->{$ffield}->{$rec_type};
  435.         if ($map) {
  436.             my $val;
  437.             foreach (keys %$map) {
  438.                 my $start = $map->{$_}->{start_pos};
  439.                 my $length = $map->{$_}->{length};
  440.                 my $default_val = $map->{$_}->{default_val};
  441.                 my $str;
  442.                 if ($_ eq 'ldr') {
  443.                     $str = $record->leader();
  444.                 } else {
  445.                     my $mfield = $record->field($_);
  446.                     if ($mfield && $mfield->is_control_field()) {
  447.                         $str = $mfield->data();
  448.                     }
  449.                 }
  450.                 if ($str && length($str) >= $start + $length) {
  451.                     $val = substr($str, $start, $length);
  452.                 }
  453.                 last if ($val && $val ne $default_val);
  454.                 $val = $default_val unless ($val);
  455.             }
  456.             return $val;
  457.         }
  458.     }
  459.     return undef;
  460. }
  461.  
  462. 1;
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top