Dyrcona

load_erecords.pl

Nov 8th, 2018 (edited)
378
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/perl
  2. # ---------------------------------------------------------------
  3. # Copyright © 2016 C/W MARS, Inc.
  4. # Jason Stephenson <jstephenson@cwmars.org>
  5. #
  6. # This program is free software; you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation; either version 2 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # This program is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14. # GNU General Public License for more details.
  15. # ---------------------------------------------------------------
  16.  
  17. use strict;
  18. use warnings;
  19. use feature qw/state/;
  20. use Getopt::Long;
  21. use MARC::Record;
  22. use MARC::File::XML (BinaryEncoding => 'utf8');
  23. use OpenILS::Utils::Normalize qw(clean_marc naco_normalize);
  24. use IO::File;
  25. use DateTime;
  26. use DateTime::TimeZone;
  27. use Time::HiRes qw/tv_interval gettimeofday/;
  28. use DBI;
  29.  
  30. IO::File->input_record_separator("\x1E\x1D");
  31. IO::File->output_record_separator("\n");
  32.  
  33. # options with defaults:
  34. my $db_user = $ENV{PGUSER} || 'evergreen';
  35. my $db_host = $ENV{PGHOST} || 'db1';
  36. my $db_db = $ENV{PGDATABASE} || 'evergreen';
  37. my $db_password = $ENV{PGPASSWORD} || 'evergreen';
  38. my $db_port = $ENV{PGPORT} || 5432;
  39. my $source;
  40. my $strict;
  41. my $timing;
  42.  
  43. GetOptions("user=s" => \$db_user,
  44.            "host=s" => \$db_host,
  45.            "db=s" => \$db_db,
  46.            "password=s" => \$db_password,
  47.            "port=i" => \$db_port,
  48.            "source=s" => \$source,
  49.            timing => \$timing,
  50.            strict => \$strict) or die("Error in command line options");
  51.  
  52. my $dbh = DBI->connect("dbi:Pg:database=$db_db;host=$db_host;port=$db_port;application_name=loaderecords",
  53.                        $db_user, $db_password,
  54.                        {PrintError => 0, RaiseError => 1, AutoCommit => 1})
  55.     or die("No database connection.");
  56.  
  57. die("Must specify --source option.") unless ($source);
  58.  
  59. $source = lookup_source($source);
  60.  
  61. die("--source is not valid.") unless ($source);
  62.  
  63. my $mapper = MARCFixedFieldMapper->new();
  64.  
  65. my ($rej, $exc); # Variables for reject and exception file handles. We only open this if necessary.
  66. my $error_count = 0; # Count of errors.
  67.  
  68. # Because this can produce lots of output, we're writing progress messages to a log file instead of standard output.
  69. my $log = IO::File->new("> log.txt");
  70.  
  71. # Make input_file and count variables be "global" to the script so we
  72. # can use them for timing logs in the find_best_match function.
  73. my ($input_file, $count);
  74.  
  75. foreach (@ARGV) {
  76.     $input_file = $_;
  77.     $count = 0;
  78.     my $fh = IO::File->new("< $input_file");
  79.     my $str = date_str($input_file, 1);
  80.     print("$str"); # For running from at, etc., so we have something in the email to let us know when it is done.
  81.     $log->print($str);
  82.     while (my $raw = <$fh>) {
  83.         $count++;
  84.         eval {
  85.             my ($match_start, $match_end, $update_start, $update_end, $message);
  86.             my $record = MARC::Record->new_from_usmarc($raw);
  87.             my @warnings = $record->warnings();
  88.             $match_start = [gettimeofday()];
  89.             my $match = find_best_match($record);
  90.             $match_end = [gettimeofday()];
  91.             if ($match) {
  92.                 no warnings qw(uninitialized);
  93.                 my $update_needed = 0;
  94.                 $message = "$input_file $count matches " . $match->{id} . " with score " . $match->{score};
  95.                 $message .= " in " . tv_interval($match_start, $match_end) . " seconds" if ($timing);
  96.                 $log->print($message);
  97.                 foreach my $nfield ($record->field('856')) {
  98.                     my $add = 1;
  99.                     foreach my $ofield ($match->{marc}->field('856')) {
  100.                         if ($nfield->subfield('9') eq $ofield->subfield('9') && $nfield->subfield('u')
  101.                                 eq $ofield->subfield('u')) {
  102.                             $add = 0;
  103.                             last;
  104.                         }
  105.                     }
  106.                     if ($add) {
  107.                         $match->{marc}->insert_fields_ordered($nfield);
  108.                         $update_needed++;
  109.                     }
  110.                 }
  111.                 if ($update_needed) {
  112.                     $update_start = [gettimeofday()];
  113.                     my $success = update_marc($match);
  114.                     $update_end = [gettimeofday()];
  115.                     if ($success == 0) { # man DBI and look for the execute statement handle description for why.
  116.                         $message = "$input_file $count update of record " . $match->{id} . " failed";
  117.                     } else {
  118.                         $message = "$input_file $count added $update_needed URL(s) to record " . $match->{id};
  119.                     }
  120.                     $message .= " in " . tv_interval($update_start, $update_end) . " seconds" if ($timing);
  121.                     $log->print($message);
  122.                 } else {
  123.                     $log->print("$input_file $count matches URL tag(s) in " . $match->{id});
  124.                 }
  125.             } else {
  126.                 if ($timing) {
  127.                     $log->print("$input_file $count did not match in " . tv_interval($match_start, $match_end) . " seconds");
  128.                 }
  129.                 if (@warnings) {
  130.                     if ($strict) {
  131.                         die("@warnings");
  132.                     } else {
  133.                         $log->print("$input_file $count @warnings");
  134.                     }
  135.                 }
  136.                 $update_start = [gettimeofday()];
  137.                 my $id = insert_marc($source, $record);
  138.                 $update_end = [gettimeofday()];
  139.                 if ($id) {
  140.                     $message = "$input_file $count inserted as bre.id $id";
  141.                 } else {
  142.                     $message = "$input_file $count failed to insert";
  143.                 }
  144.                 $message .= " in " . tv_interval($update_start, $update_end) . " seconds" if ($timing);
  145.                 $log->print($message);
  146.             }
  147.         };
  148.         if ($@) {
  149.             my $error = $@;
  150.             $error =~ s/\s+$//;
  151.             $error_count++;
  152.             unless ($rej) {
  153.                 $rej = IO::File->new("> skipped_bibs.mrc");
  154.                 $rej->binmode(':raw');
  155.             }
  156.             unless ($exc) {
  157.                 $exc = IO::File->new("> exceptions.txt");
  158.             }
  159.             { local $\; # Just makin' sure.
  160.               $rej->print($raw); }
  161.             { local $\ = "\cM\cJ";
  162.               $exc->print("Record $error_count: $error"); }
  163.             $log->print("$input_file $count $error");
  164.         }
  165.     }
  166.     $fh->close();
  167.     $str = date_str($input_file, 0);
  168.     print("$str"); # For running from at, etc., so we have something in the email to let us know when it is done.
  169.     $log->print($str);
  170. }
  171.  
  172. END {
  173.     $dbh->disconnect() if ($dbh);
  174.     if ($log && $log->opened()) {
  175.         $log->close();
  176.     }
  177.     if ($rej && $rej->opened()) {
  178.         $rej->close();
  179.     }
  180.     if ($exc && $exc->opened()) {
  181.         $exc->close();
  182.     }
  183. }
  184.  
  185. sub find_best_match {
  186.     my $record = shift;
  187.  
  188.     # For finer-grained search timing.
  189.     my ($start, $end);
  190.  
  191.     $start = [gettimeofday()];
  192.     my $id_matches = get_identifier_matches($record);
  193.     $end = [gettimeofday()];
  194.     $log->print("$input_file $count get_identifier_matches took " . tv_interval($start, $end) . " seconds")
  195.         if ($timing);
  196.  
  197.     $start = [gettimeofday()];
  198.     my $isbn_matches = get_isbn_matches($record);
  199.     $end = [gettimeofday()];
  200.     $log->print("$input_file $count get_isbn_matches took " . tv_interval($start, $end) . " seconds")
  201.         if ($timing);
  202.  
  203.     if ($id_matches || $isbn_matches) {
  204.         my %merged;
  205.         if ($id_matches && $isbn_matches) {
  206.             %merged = %$id_matches;
  207.             foreach my $k (keys %$isbn_matches) {
  208.                 if ($merged{$k}) {
  209.                     $merged{$k}->{score} += $isbn_matches->{$k}->{score};
  210.                 } else {
  211.                     $merged{$k} = $isbn_matches->{$k};
  212.                 }
  213.             }
  214.         } elsif ($id_matches) {
  215.             %merged = %$id_matches;
  216.         } else {
  217.             %merged = %$isbn_matches;
  218.         }
  219.  
  220.         my @results = sort {$b->{score} <=> $a->{score}} sort {$b->{id} <=> $a->{id}} values %merged;
  221.         my $data = $results[0];
  222.         $data->{marc} = MARC::Record->new_from_xml($data->{marc}) if ($data && ref($data) eq 'HASH' && $data->{marc});
  223.         return $data;
  224.     }
  225.  
  226.     return undef;
  227. }
  228.  
  229. sub get_identifier_matches {
  230.     my $record = shift;
  231.  
  232.     state $sth = $dbh->prepare(<<'EOQ'
  233. select bre.id, bre.marc, 2 as score
  234. from biblio.record_entry bre
  235. join metabib.record_attr_vector_list mravl on mravl.source = bre.id
  236. join config.coded_value_map itype on idx(mravl.vlist, itype.id) > 0
  237. and itype.ctype = 'item_type' and itype.code = $1
  238. join config.coded_value_map iform on idx(mravl.vlist, iform.id) > 0
  239. and iform.ctype = 'item_form' and iform.code = $2
  240. join metabib.real_full_rec identifier on identifier.record = bre.id
  241. and identifier.tag = '035'
  242. and identifier.subfield = 'a'
  243. and identifier.value = any($3)
  244. where not bre.deleted
  245. EOQ
  246.     );
  247.  
  248.     $sth->bind_param(1, $mapper->type($record));
  249.     $sth->bind_param(2, $mapper->form($record));
  250.     $sth->bind_param(3, prepare_identifiers($record));
  251.     if ($sth->execute()) {
  252.         my $data = $sth->fetchall_hashref('id');
  253.         if ($data && %$data) {
  254.             return $data;
  255.         }
  256.     }
  257.  
  258.     return undef;
  259. }
  260.  
  261. sub get_isbn_matches {
  262.     my $record = shift;
  263.  
  264.     my $isbn_query = prepare_isbns($record);
  265.  
  266.     state $sth = $dbh->prepare(<<'EOQ'
  267. select bre.id, bre.marc, 1 as score
  268. from biblio.record_entry bre
  269. join metabib.record_attr_vector_list mravl on mravl.source = bre.id
  270. join config.coded_value_map itype on idx(mravl.vlist, itype.id) > 0
  271. and itype.ctype = 'item_type' and itype.code = $1
  272. join config.coded_value_map iform on idx(mravl.vlist, iform.id) > 0
  273. and iform.ctype = 'item_form' and iform.code = $2
  274. join metabib.real_full_rec isbn on isbn.record = bre.id
  275. and isbn.tag = '020'
  276. and isbn.subfield = 'a'
  277. and index_vector @@ $3
  278. where not bre.deleted
  279. EOQ
  280.     );
  281.  
  282.     if ($isbn_query) {
  283.         $sth->bind_param(1, $mapper->type($record));
  284.         $sth->bind_param(2, $mapper->form($record));
  285.         $sth->bind_param(3, $isbn_query);
  286.         if ($sth->execute()) {
  287.             my $data = $sth->fetchall_hashref('id');
  288.             if ($data && %$data) {
  289.                 return $data;
  290.             }
  291.         }
  292.     }
  293.  
  294.     return undef;
  295. }
  296.  
  297. sub prepare_identifiers {
  298.     my $record = shift;
  299.     my $out = [];
  300.  
  301.     my @fields = $record->field('035');
  302.     foreach my $field (@fields) {
  303.         my $str = $field->subfield('a');
  304.         push(@$out, naco_normalize($str, 'a')) if ($str);
  305.     }
  306.     return $out;
  307. }
  308.  
  309. sub prepare_isbns {
  310.     my $record = shift;
  311.     my @isbns = ();
  312.     my @fields = $record->field('020');
  313.     foreach my $field (@fields) {
  314.         my $isbn = $field->subfield('a');
  315.         next unless($isbn);
  316.         $isbn = naco_normalize($isbn, 'a');
  317.         my $idx = index($isbn, ' ');
  318.         $isbn = substr($isbn, 0, $idx) if ($idx != -1);
  319.         push(@isbns, $isbn) unless (grep {$_ eq $isbn} @isbns);
  320.     }
  321.     return join(' | ', @isbns);
  322. }
  323.  
  324. sub lookup_source {
  325.     my $source = shift;
  326.     if ($source =~ /^\d+$/) {
  327.         # check that this is a valid source id.
  328.         my $data = $dbh->selectall_arrayref("select source from config.bib_source where id = $source");
  329.         if ($data && @$data) {
  330.             return $source;
  331.         }
  332.     } else {
  333.         my $data = $dbh->selectall_arrayref('select id from config.bib_source where source ~* ?', {}, "^$source");
  334.         if ($data && @$data) {
  335.             return $data->[0]->[0];
  336.         }
  337.     }
  338.     return undef;
  339. }
  340.  
  341. sub update_marc {
  342.     my $ref = shift;
  343.     state $sth = $dbh->prepare('update biblio.record_entry set marc = $2 where id = $1');
  344.     $sth->bind_param(1, $ref->{id});
  345.     $sth->bind_param(2, clean_marc($ref->{marc}));
  346.     return $sth->execute();
  347. }
  348.  
  349. sub insert_marc {
  350.     my ($source, $record) = @_;
  351.     state $sth = $dbh->prepare(<<EOINSERT
  352. insert into biblio.record_entry
  353. (source, marc, last_xact_id)
  354. values
  355. (?, ?, pg_backend_pid() || '.' || extract(epoch from now()))
  356. returning id
  357. EOINSERT
  358.     );
  359.     $sth->bind_param(1, $source);
  360.     $sth->bind_param(2, clean_marc($record));
  361.     if ($sth->execute()) {
  362.         my $data = $sth->fetchall_arrayref();
  363.         if ($data && @$data) {
  364.             return $data->[0]->[0];
  365.         }
  366.     }
  367.     return undef;
  368. }
  369.  
  370. sub date_str {
  371.     my ($file, $open) = @_;
  372.     my $dt = DateTime->now(time_zone => DateTime::TimeZone->new(name => 'local'));
  373.     return (($open) ? 'Starting' : 'Closing') . " $file at " . $dt->strftime('%a, %d %b %Y %H:%M:%S %z.');
  374. }
  375.  
  376. package MARCFixedFieldMapper;
  377.  
  378. use vars qw/$AUTOLOAD/;
  379.  
  380. sub new {
  381.     my $proto = shift;
  382.     my $class = ref $proto || $proto;
  383.     my $self = {};
  384.     my $instance = bless($self, $class);
  385.     $instance->_init_rec_type_map();
  386.     $instance->_init_fixed_field_map();
  387.     return $instance;
  388. }
  389.  
  390. sub _init_rec_type_map {
  391.     my $self = shift;
  392.     eval {
  393.         $self->{marc21_rec_type_map} = $dbh->selectall_hashref('select * from config.marc21_rec_type_map', 'code');
  394.     };
  395.     if ($@) {
  396.         die("Failed to initialize MARCFixedFieldMapper: $@");
  397.     }
  398. }
  399.  
  400. sub _init_fixed_field_map {
  401.     my $self = shift;
  402.     eval {
  403.         $self->{marc21_ff_pos_map} = $dbh->selectall_hashref('select * from config.marc21_ff_pos_map',
  404.                                                              ['fixed_field', 'rec_type', 'tag']);
  405.     };
  406.     if ($@) {
  407.         die("Failed to initialize MARCFixedFieldMapper: $@");
  408.     }
  409.     $self->{field_map} = {};
  410.     foreach my $ff (keys %{$self->{marc21_ff_pos_map}}) {
  411.         my $f = lc($ff);
  412.         $f =~ s|/||;
  413.         $self->{field_map}->{$f} = $ff;
  414.     }
  415. }
  416.  
  417. sub item_type {
  418.     my $self = shift;
  419.     my $record = shift;
  420.     my $ldr = $record->leader();
  421.     return substr($ldr, 6, 1);
  422. }
  423.  
  424. sub bib_level {
  425.     my $self = shift;
  426.     my $record = shift;
  427.     my $ldr = $record->leader();
  428.     return substr($ldr, 7, 1);
  429. }
  430.  
  431. sub rec_type {
  432.     my $self = shift;
  433.     my $record = shift;
  434.  
  435.     my $href = $self->{marc21_rec_type_map};
  436.     my $itype = $self->item_type($record);
  437.     my $blvl = $self->bib_level($record);
  438.     my ($rec_type) = grep {$href->{$_}->{type_val} =~ $itype && $href->{$_}->{blvl_val} =~ $blvl} keys %$href;
  439.     return $rec_type;
  440. }
  441.  
  442. sub AUTOLOAD {
  443.     my $self = shift;
  444.     my $record = shift;
  445.  
  446.     my $field = $AUTOLOAD;
  447.     $field =~ s/.*:://;
  448.     if ($self->{field_map}->{$field}) {
  449.         my $ffield = $self->{field_map}->{$field};
  450.         my $rec_type = $self->rec_type($record);
  451.         my $map = $self->{marc21_ff_pos_map}->{$ffield}->{$rec_type};
  452.         if ($map) {
  453.             my $val;
  454.             foreach (keys %$map) {
  455.                 my $start = $map->{$_}->{start_pos};
  456.                 my $length = $map->{$_}->{length};
  457.                 my $default_val = $map->{$_}->{default_val};
  458.                 my $str;
  459.                 if ($_ eq 'ldr') {
  460.                     $str = $record->leader();
  461.                 } else {
  462.                     my $mfield = $record->field($_);
  463.                     if ($mfield && $mfield->is_control_field()) {
  464.                         $str = $mfield->data();
  465.                     }
  466.                 }
  467.                 if ($str && length($str) >= $start + $length) {
  468.                     $val = substr($str, $start, $length);
  469.                 }
  470.                 last if ($val && $val ne $default_val);
  471.                 $val = $default_val unless ($val);
  472.             }
  473.             return $val;
  474.         }
  475.     }
  476.     return undef;
  477. }
  478.  
  479. 1;
  480.  
RAW Paste Data