Advertisement
Dyrcona

load_erecords.pl

Nov 8th, 2018 (edited)
786
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 15.95 KB | None | 0 0
  1. #!/usr/bin/perl
  2. # ---------------------------------------------------------------
  3. # Copyright © 2016, 2021, 2022 C/W MARS, Inc.
  4. # Jason Stephenson <jstephenson@cwmars.org>
  5. #
  6. # This program is free software; you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation; either version 2 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # This program is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14. # GNU General Public License for more details.
  15. # ---------------------------------------------------------------
  16.  
  17. use strict;
  18. use warnings;
  19. use feature qw/state/;
  20. use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
  21. use MARC::Record;
  22. use MARC::File::XML (BinaryEncoding => 'utf8');
  23. use OpenILS::Utils::Normalize qw(clean_marc naco_normalize);
  24. use IO::File;
  25. use DateTime;
  26. use DateTime::TimeZone;
  27. use Time::HiRes qw/tv_interval gettimeofday/;
  28. use DBI;
  29.  
  30. IO::File->input_record_separator("\x1E\x1D");
  31. IO::File->output_record_separator("\n");
  32.  
  33. # options with defaults:
  34. my $db_user = $ENV{PGUSER} || 'evergreen';
  35. my $db_host = $ENV{PGHOST} || 'db1';
  36. my $db_db = $ENV{PGDATABASE} || 'evergreen';
  37. my $db_password = $ENV{PGPASSWORD} || 'evergreen';
  38. my $db_port = $ENV{PGPORT} || 5432;
  39. my $source;
  40. my $strict;
  41. my $timing;
  42.  
  43. GetOptions("user|U=s" => \$db_user,
  44.            "host|h=s" => \$db_host,
  45.            "database|d=s" => \$db_db,
  46.            "password|P=s" => \$db_password,
  47.            "port|p=i" => \$db_port,
  48.            "source|s=s" => \$source,
  49.            "timing|t" => \$timing,
  50.            "strict|S" => \$strict) or die("Error in command line options");
  51.  
  52. my $dbh = DBI->connect("dbi:Pg:database=$db_db;host=$db_host;port=$db_port;application_name=loaderecords",
  53.                        $db_user, $db_password,
  54.                        {PrintError => 0, RaiseError => 1, AutoCommit => 1})
  55.     or die("No database connection.");
  56.  
  57. die("Must specify --source option.") unless ($source);
  58.  
  59. $source = lookup_source($source);
  60.  
  61. die("--source is not valid.") unless ($source);
  62.  
  63. my $mapper = MARCFixedFieldMapper->new();
  64.  
  65. my ($rej, $exc); # Variables for reject and exception file handles. We only open this if necessary.
  66. my $error_count = 0; # Count of errors.
  67.  
  68. # Because this can produce lots of output, we're writing progress messages to a log file instead of standard output.
  69. my $log = IO::File->new("> log.txt");
  70.  
  71. # Make input_file and count variables be "global" to the script so we
  72. # can use them for timing logs in the find_best_match function.
  73. my ($input_file, $count);
  74.  
  75. foreach (@ARGV) {
  76.     $input_file = $_;
  77.     $count = 0;
  78.     my $fh = IO::File->new("< $input_file");
  79.     my $str = date_str($input_file, 1);
  80.     print("$str"); # For running from at, etc., so we have something in the email to let us know when it is done.
  81.     $log->print($str);
  82.     while (my $raw = <$fh>) {
  83.         $count++;
  84.         eval {
  85.             my ($match_start, $match_end, $update_start, $update_end, $message);
  86.             my $record = MARC::Record->new_from_usmarc($raw);
  87.             my @warnings = $record->warnings();
  88.             if (@warnings) {
  89.                 $log->print("$input_file $count @warnings");
  90.                 if ($strict) {
  91.                     $log->print("$input_file $count skipped");
  92.                     die("@warnings");
  93.                 }
  94.             }
  95.             $match_start = [gettimeofday()];
  96.             my $match = find_best_match($record);
  97.             $match_end = [gettimeofday()];
  98.             if ($match) {
  99.                 no warnings qw(uninitialized);
  100.                 my $update_needed = 0;
  101.                 $message = "$input_file $count matches " . $match->{id} . " with score " . $match->{score};
  102.                 $message .= " in " . tv_interval($match_start, $match_end) . " seconds" if ($timing);
  103.                 $log->print($message);
  104.                 foreach my $nfield ($record->field('856')) {
  105.                     my $add = 1;
  106.                     foreach my $ofield ($match->{marc}->field('856')) {
  107.                         if ($nfield->subfield('9') eq $ofield->subfield('9') && $nfield->subfield('u')
  108.                                 eq $ofield->subfield('u') && $nfield->subfield('3') eq $ofield->subfield('3')) {
  109.                             $add = 0;
  110.                             last;
  111.                         }
  112.                     }
  113.                     if ($add) {
  114.                         $match->{marc}->insert_fields_ordered($nfield);
  115.                         $update_needed++;
  116.                     }
  117.                 }
  118.                 if ($update_needed) {
  119.                     $update_start = [gettimeofday()];
  120.                     my $success = update_marc($match);
  121.                     $update_end = [gettimeofday()];
  122.                     if ($success == 0) { # man DBI and look for the execute statement handle description for why.
  123.                         $message = "$input_file $count update of record " . $match->{id} . " failed";
  124.                     } else {
  125.                         $message = "$input_file $count added $update_needed URL(s) to record " . $match->{id};
  126.                     }
  127.                     $message .= " in " . tv_interval($update_start, $update_end) . " seconds" if ($timing);
  128.                     $log->print($message);
  129.                 } else {
  130.                     $log->print("$input_file $count matches URL tag(s) in " . $match->{id});
  131.                 }
  132.             } else {
  133.                 if ($timing) {
  134.                     $log->print("$input_file $count did not match in " . tv_interval($match_start, $match_end) . " seconds");
  135.                 }
  136.                 $update_start = [gettimeofday()];
  137.                 my $id = insert_marc($source, $record);
  138.                 $update_end = [gettimeofday()];
  139.                 if ($id) {
  140.                     $message = "$input_file $count inserted as bre.id $id";
  141.                 } else {
  142.                     $message = "$input_file $count failed to insert";
  143.                 }
  144.                 $message .= " in " . tv_interval($update_start, $update_end) . " seconds" if ($timing);
  145.                 $log->print($message);
  146.             }
  147.         };
  148.         if ($@) {
  149.             my $error = $@;
  150.             $error =~ s/\s+$//;
  151.             $error_count++;
  152.             reject($raw);
  153.             unless ($exc) {
  154.                 $exc = IO::File->new("> exceptions.txt");
  155.             }
  156.             { local $\ = "\cM\cJ";
  157.               $exc->print("Record $error_count: $error"); }
  158.             $log->print("$input_file $count $error");
  159.         }
  160.     }
  161.     $fh->close();
  162.     $str = date_str($input_file, 0);
  163.     print("$str"); # For running from at, etc., so we have something in the email to let us know when it is done.
  164.     $log->print($str);
  165. }
  166.  
  167. END {
  168.     $dbh->disconnect() if ($dbh);
  169.     if ($log && $log->opened()) {
  170.         $log->close();
  171.     }
  172.     if ($rej && $rej->opened()) {
  173.         $rej->close();
  174.     }
  175.     if ($exc && $exc->opened()) {
  176.         $exc->close();
  177.     }
  178. }
  179.  
  180. sub reject {
  181.     my $raw = shift;
  182.     unless ($rej) {
  183.         $rej = IO::File->new("> skipped_bibs.mrc");
  184.         $rej->binmode(':raw');
  185.     }
  186.     local $\;
  187.     $rej->print($raw);
  188. }
  189.  
  190. sub find_best_match {
  191.     my $record = shift;
  192.  
  193.     # For finer-grained search timing.
  194.     my ($start, $end);
  195.  
  196.     $start = [gettimeofday()];
  197.     my $id_matches = get_identifier_matches($record);
  198.     $end = [gettimeofday()];
  199.     $log->print("$input_file $count get_identifier_matches took " . tv_interval($start, $end) . " seconds")
  200.         if ($timing);
  201.  
  202.     $start = [gettimeofday()];
  203.     my $isbn_matches = get_isbn_matches($record);
  204.     $end = [gettimeofday()];
  205.     $log->print("$input_file $count get_isbn_matches took " . tv_interval($start, $end) . " seconds")
  206.         if ($timing);
  207.  
  208.     if ($id_matches || $isbn_matches) {
  209.         my %merged;
  210.         if ($id_matches && $isbn_matches) {
  211.             %merged = %$id_matches;
  212.             foreach my $k (keys %$isbn_matches) {
  213.                 if ($merged{$k}) {
  214.                     $merged{$k}->{score} += $isbn_matches->{$k}->{score};
  215.                 } else {
  216.                     $merged{$k} = $isbn_matches->{$k};
  217.                 }
  218.             }
  219.         } elsif ($id_matches) {
  220.             %merged = %$id_matches;
  221.         } else {
  222.             %merged = %$isbn_matches;
  223.         }
  224.  
  225.         my @results = sort {$b->{score} <=> $a->{score}} sort {$b->{id} <=> $a->{id}} values %merged;
  226.         my $data = $results[0];
  227.         $data->{marc} = MARC::Record->new_from_xml($data->{marc}) if ($data && ref($data) eq 'HASH' && $data->{marc});
  228.         return $data;
  229.     }
  230.  
  231.     return undef;
  232. }
  233.  
  234. sub get_identifier_matches {
  235.     my $record = shift;
  236.  
  237.     state $sth = $dbh->prepare(<<'EOQ'
  238. select bre.id, bre.marc, 2 as score
  239. from biblio.record_entry bre
  240. join metabib.record_attr_vector_list mravl on mravl.source = bre.id
  241. join config.coded_value_map itype on idx(mravl.vlist, itype.id) > 0
  242. and itype.ctype = 'item_type' and itype.code = $1
  243. join config.coded_value_map iform on idx(mravl.vlist, iform.id) > 0
  244. and iform.ctype = 'item_form' and iform.code = $2
  245. join metabib.real_full_rec identifier on identifier.record = bre.id
  246. and identifier.tag = '035'
  247. and identifier.subfield = 'a'
  248. and identifier.value = any($3)
  249. where not bre.deleted
  250. EOQ
  251.     );
  252.  
  253.     $sth->bind_param(1, $mapper->type($record));
  254.     $sth->bind_param(2, $mapper->form($record));
  255.     $sth->bind_param(3, prepare_identifiers($record));
  256.     if ($sth->execute()) {
  257.         my $data = $sth->fetchall_hashref('id');
  258.         if ($data && %$data) {
  259.             return $data;
  260.         }
  261.     }
  262.  
  263.     return undef;
  264. }
  265.  
  266. sub get_isbn_matches {
  267.     my $record = shift;
  268.  
  269.     my $isbn_query = prepare_isbns($record);
  270.  
  271.     state $sth = $dbh->prepare(<<'EOQ'
  272. select bre.id, bre.marc, 1 as score
  273. from biblio.record_entry bre
  274. join metabib.record_attr_vector_list mravl on mravl.source = bre.id
  275. join config.coded_value_map itype on idx(mravl.vlist, itype.id) > 0
  276. and itype.ctype = 'item_type' and itype.code = $1
  277. join config.coded_value_map iform on idx(mravl.vlist, iform.id) > 0
  278. and iform.ctype = 'item_form' and iform.code = $2
  279. join metabib.real_full_rec isbn on isbn.record = bre.id
  280. and isbn.tag = '020'
  281. and isbn.subfield = 'a'
  282. and index_vector @@ $3
  283. where not bre.deleted
  284. EOQ
  285.     );
  286.  
  287.     if ($isbn_query) {
  288.         $sth->bind_param(1, $mapper->type($record));
  289.         $sth->bind_param(2, $mapper->form($record));
  290.         $sth->bind_param(3, $isbn_query);
  291.         if ($sth->execute()) {
  292.             my $data = $sth->fetchall_hashref('id');
  293.             if ($data && %$data) {
  294.                 return $data;
  295.             }
  296.         }
  297.     }
  298.  
  299.     return undef;
  300. }
  301.  
  302. sub prepare_identifiers {
  303.     my $record = shift;
  304.     my $out = [];
  305.  
  306.     my @fields = $record->field('035');
  307.     foreach my $field (@fields) {
  308.         my $str = $field->subfield('a');
  309.         push(@$out, naco_normalize($str, 'a')) if ($str);
  310.     }
  311.     return $out;
  312. }
  313.  
  314. sub prepare_isbns {
  315.     my $record = shift;
  316.     my @isbns = ();
  317.     my @fields = $record->field('020');
  318.     foreach my $field (@fields) {
  319.         my $isbn = $field->subfield('a');
  320.         next unless($isbn);
  321.         $isbn = naco_normalize($isbn, 'a');
  322.         my $idx = index($isbn, ' ');
  323.         $isbn = substr($isbn, 0, $idx) if ($idx != -1);
  324.         push(@isbns, $isbn) unless (grep {$_ eq $isbn} @isbns);
  325.     }
  326.     return join(' | ', @isbns);
  327. }
  328.  
  329. sub lookup_source {
  330.     my $source = shift;
  331.     if ($source =~ /^\d+$/) {
  332.         # check that this is a valid source id.
  333.         my $data = $dbh->selectall_arrayref("select source from config.bib_source where id = $source");
  334.         if ($data && @$data) {
  335.             return $source;
  336.         }
  337.     } else {
  338.         my $data = $dbh->selectall_arrayref('select id from config.bib_source where source ~* ?', {}, "^$source");
  339.         if ($data && @$data) {
  340.             return $data->[0]->[0];
  341.         }
  342.     }
  343.     return undef;
  344. }
  345.  
  346. sub update_marc {
  347.     my $ref = shift;
  348.     state $sth = $dbh->prepare('update biblio.record_entry set marc = $2 where id = $1');
  349.     $sth->bind_param(1, $ref->{id});
  350.     $sth->bind_param(2, clean_marc($ref->{marc}));
  351.     return $sth->execute();
  352. }
  353.  
  354. sub insert_marc {
  355.     my ($source, $record) = @_;
  356.     state $sth = $dbh->prepare(<<EOINSERT
  357. insert into biblio.record_entry
  358. (source, marc, last_xact_id)
  359. values
  360. (?, ?, pg_backend_pid() || '.' || extract(epoch from now()))
  361. returning id
  362. EOINSERT
  363.     );
  364.     $sth->bind_param(1, $source);
  365.     $sth->bind_param(2, clean_marc($record));
  366.     if ($sth->execute()) {
  367.         my $data = $sth->fetchall_arrayref();
  368.         if ($data && @$data) {
  369.             return $data->[0]->[0];
  370.         }
  371.     }
  372.     return undef;
  373. }
  374.  
  375. sub date_str {
  376.     my ($file, $open) = @_;
  377.     my $dt = DateTime->now(time_zone => DateTime::TimeZone->new(name => 'local'));
  378.     return (($open) ? 'Starting' : 'Closing') . " $file at " . $dt->strftime('%a, %d %b %Y %H:%M:%S %z.');
  379. }
  380.  
  381. package MARCFixedFieldMapper;
  382.  
  383. use vars qw/$AUTOLOAD/;
  384.  
  385. sub new {
  386.     my $proto = shift;
  387.     my $class = ref $proto || $proto;
  388.     my $self = {};
  389.     my $instance = bless($self, $class);
  390.     $instance->_init_rec_type_map();
  391.     $instance->_init_fixed_field_map();
  392.     return $instance;
  393. }
  394.  
  395. sub _init_rec_type_map {
  396.     my $self = shift;
  397.     eval {
  398.         $self->{marc21_rec_type_map} = $dbh->selectall_hashref('select * from config.marc21_rec_type_map', 'code');
  399.     };
  400.     if ($@) {
  401.         die("Failed to initialize MARCFixedFieldMapper: $@");
  402.     }
  403. }
  404.  
  405. sub _init_fixed_field_map {
  406.     my $self = shift;
  407.     eval {
  408.         $self->{marc21_ff_pos_map} = $dbh->selectall_hashref('select * from config.marc21_ff_pos_map',
  409.                                                              ['fixed_field', 'rec_type', 'tag']);
  410.     };
  411.     if ($@) {
  412.         die("Failed to initialize MARCFixedFieldMapper: $@");
  413.     }
  414.     $self->{field_map} = {};
  415.     foreach my $ff (keys %{$self->{marc21_ff_pos_map}}) {
  416.         my $f = lc($ff);
  417.         $f =~ s|/||;
  418.         $self->{field_map}->{$f} = $ff;
  419.     }
  420. }
  421.  
  422. sub item_type {
  423.     my $self = shift;
  424.     my $record = shift;
  425.     my $ldr = $record->leader();
  426.     return substr($ldr, 6, 1);
  427. }
  428.  
  429. sub bib_level {
  430.     my $self = shift;
  431.     my $record = shift;
  432.     my $ldr = $record->leader();
  433.     return substr($ldr, 7, 1);
  434. }
  435.  
  436. sub rec_type {
  437.     my $self = shift;
  438.     my $record = shift;
  439.  
  440.     my $href = $self->{marc21_rec_type_map};
  441.     my $itype = $self->item_type($record);
  442.     my $blvl = $self->bib_level($record);
  443.     my ($rec_type) = grep {$href->{$_}->{type_val} =~ $itype && $href->{$_}->{blvl_val} =~ $blvl} keys %$href;
  444.     return $rec_type;
  445. }
  446.  
  447. sub AUTOLOAD {
  448.     my $self = shift;
  449.     my $record = shift;
  450.  
  451.     my $field = $AUTOLOAD;
  452.     $field =~ s/.*:://;
  453.     if ($self->{field_map}->{$field}) {
  454.         my $ffield = $self->{field_map}->{$field};
  455.         my $rec_type = $self->rec_type($record);
  456.         my $map = $self->{marc21_ff_pos_map}->{$ffield}->{$rec_type};
  457.         if ($map) {
  458.             my $val;
  459.             foreach (keys %$map) {
  460.                 my $start = $map->{$_}->{start_pos};
  461.                 my $length = $map->{$_}->{length};
  462.                 my $default_val = $map->{$_}->{default_val};
  463.                 my $str;
  464.                 if ($_ eq 'ldr') {
  465.                     $str = $record->leader();
  466.                 } else {
  467.                     my $mfield = $record->field($_);
  468.                     if ($mfield && $mfield->is_control_field()) {
  469.                         $str = $mfield->data();
  470.                     }
  471.                 }
  472.                 if ($str && length($str) >= $start + $length) {
  473.                     $val = substr($str, $start, $length);
  474.                 }
  475.                 last if ($val && $val ne $default_val);
  476.                 $val = $default_val unless ($val);
  477.             }
  478.             return $val;
  479.         }
  480.     }
  481.     return undef;
  482. }
  483.  
  484. 1;
  485.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement