Annual Cleanup - Bibliographic Deduplication
roganhamby
Jul 21st, 2015

-- SCLENDS bibliographic dedupe routine
--
-- Copyright 2010-2011 Equinox Software, Inc.
-- Author: Galen Charlton
--
-- This implements a bibliographic deduplication routine based
-- on criteria and an algorithm specified by the South Carolina
-- State Library on behalf of the SC LENDS consortium. This work
-- was sponsored by SC LENDS, whose impetus is gratefully
-- acknowledged. Portions of this script were subsequently expanded
-- based on the advice of the Indiana State Library on behalf
-- of the Evergreen Indiana project.

-- schema to store the dedupe routine and intermediate data
CREATE SCHEMA m_dedupe;

CREATE TYPE mig_isbn_match AS (norm_isbn TEXT, norm_title TEXT, qual TEXT, bibid BIGINT);

-- function to calculate the normalized ISBN and title match keys
-- and the bibliographic portion of the quality score. The normalized
-- ISBN key consists of the set of 020$a and 020$z values normalized as follows:
-- * numeric portion of the ISBN converted to ISBN-13 format
--
-- The normalized title key is taken from the 245$a with the nonfiling
-- characters and leading and trailing whitespace removed, ampersands
-- converted to ' and ', other punctuation removed, and the text converted
-- to lowercase.
--
-- The quality score is a 19-digit integer computed by concatenating
-- counts of various attributes in the MARC records; see the get_quality
-- routine for details.
--
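-- As a worked illustration (hypothetical record, not from the original
-- script): a record with an 003 (1), two 02x fields (02), one 24x field
-- (01), a 21-character 300 (21), a 15-character 100 (15), no 010 (00),
-- three 5xx notes (03), four 6xx subjects (04), one series field (01),
-- and two 7xx added entries (02) scores '1020121150003040102'. Since
-- every score is exactly 19 digits, comparing the strings with max()
-- picks the more fully described record.
--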
CREATE OR REPLACE FUNCTION m_dedupe.get_isbn_match_key (bib_id BIGINT, marc TEXT) RETURNS SETOF mig_isbn_match AS $func$
use strict;
use warnings;

use MARC::Record;
use MARC::File::XML (BinaryEncoding => 'utf8');
use Business::ISBN;

binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');

# build the 19-digit quality score described above: a presence flag
# for the 003 followed by two-digit counts or lengths of other fields
my $get_quality = sub {
    my $marc = shift;

    my $has003 = (scalar($marc->field('003'))) ? '1' : '0';

    return join('', $has003,
        count_field($marc, '02.'),
        count_field($marc, '24.'),
        field_length($marc, '300'),
        field_length($marc, '100'),
        count_field($marc, '010'),
        count_field($marc, '50.', '51.', '52.', '53.', '54.', '55.', '56.', '57.', '58.'),
        count_field($marc, '6..'),
        count_field($marc, '440', '490', '830'),
        count_field($marc, '7..'),
    );
};

my ($bibid, $xml) = @_;

# force Leader/09 to 'a' (UCS/Unicode) so that MARC::File::XML
# parses the record as UTF-8
$xml =~ s/(<leader>.........)./${1}a/;
my $marc;
eval {
    $marc = MARC::Record->new_from_xml($xml);
};
if ($@) {
    #elog("could not parse $bibid: $@\n");
    # reset the parser so one bad record doesn't affect later calls
    import MARC::File::XML (BinaryEncoding => 'utf8');
    return;
}

my @f245 = $marc->field('245');
return unless @f245; # must have a 245
my $norm_title = norm_title($f245[0]);
return unless $norm_title ne '';

my @isbns = $marc->field('020');
return unless @isbns; # must have at least one 020

my $qual = $get_quality->($marc);

# emit one row per distinct normalized ISBN
my @norm_isbns = norm_isbns(@isbns);
foreach my $isbn (@norm_isbns) {
    return_next({ norm_isbn => $isbn, norm_title => $norm_title, qual => $qual, bibid => $bibid });
}
return;

# count occurrences of the given tags (tag patterns are regexes),
# capped at 99 and zero-padded to two digits
sub count_field {
    my $marc = shift;
    my @tags = @_;
    my $total = 0;
    foreach my $tag (@tags) {
        my @f = $marc->field($tag);
        $total += scalar(@f);
    }
    $total = 99 if $total > 99;
    return sprintf("%02d", $total);
}

# length of the first occurrence of the given tag, capped at 99
# and zero-padded to two digits
sub field_length {
    my $marc = shift;
    my $tag = shift;

    my @f = $marc->field($tag);
    return '00' unless @f;
    my $len = length($f[0]->as_string);
    $len = 99 if $len > 99;
    return sprintf("%02d", $len);
}

# normalize the 245$a: strip nonfiling characters, lowercase,
# convert ampersands, and remove punctuation and extra whitespace
sub norm_title {
    my $f245 = shift;
    my $sfa = $f245->subfield('a');
    return '' unless defined $sfa;
    my $nonf = $f245->indicator(2);
    $nonf = '0' unless $nonf =~ /^\d$/;
    if ($nonf == 0) {
        # no nonfiling count given; strip common English articles
        $sfa =~ s/^a //i;
        $sfa =~ s/^an //i;
        $sfa =~ s/^the //i;
    } else {
        $sfa = substr($sfa, $nonf);
    }
    $sfa =~ s/&/ and /g;
    $sfa = lc $sfa;
    $sfa =~ s/\[large print\]//;
    $sfa =~ s/[[:punct:]]//g;
    $sfa =~ s/^\s+//;
    $sfa =~ s/\s+$//;
    $sfa =~ s/\s+/ /g;
    return $sfa;
}

# collect the distinct normalized ISBNs from the 020$a and 020$z
sub norm_isbns {
    my @isbns = @_;

    my %uniq_isbns = ();
    foreach my $field (@isbns) {
        my $sfa = $field->subfield('a');
        my $norm = norm_isbn($sfa);
        $uniq_isbns{$norm}++ unless $norm eq '';
        my $sfz = $field->subfield('z');
        $norm = norm_isbn($sfz);
        $uniq_isbns{$norm}++ unless $norm eq '';
    }
    return sort(keys %uniq_isbns);
}

# strip hyphens and whitespace, then convert ISBN-10s to ISBN-13 form
sub norm_isbn {
    my $str = shift;
    my $norm = '';
    return '' unless defined $str;
    $str =~ s/-//g;
    $str =~ s/^\s+//;
    $str =~ s/\s+$//;
    $str =~ s/\s+//g;
    $str = lc $str;
    my $isbn;
    if ($str =~ /^(\d{12}[0-9x])/) {
        # already an ISBN-13
        $isbn = $1;
        $norm = $isbn;
    } elsif ($str =~ /^(\d{9}[0-9x])/) {
        # ISBN-10: convert to ISBN-13; guard against strings that
        # look like ISBNs but fail Business::ISBN validation
        $isbn = Business::ISBN->new($1);
        if ($isbn and $isbn->is_valid) {
            my $isbn13 = $isbn->as_isbn13;
            $norm = lc($isbn13->as_string);
            $norm =~ s/-//g;
        }
    }
    return $norm;
}
$func$ LANGUAGE PLPERLU;
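
-- A quick smoke test (not part of the original script): compute the
-- match keys for a single bib; 123 is a placeholder record ID.
SELECT (a.get_isbn_match_key::mig_isbn_match).*
FROM (
    SELECT m_dedupe.get_isbn_match_key(bre.id, bre.marc)
    FROM biblio.record_entry bre
    WHERE bre.id = 123
) a;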

-- Specify the set of bibs to dedupe. This version collects the IDs
-- of all non-deleted bibs whose source can have copies, but the query
-- could be expanded further to exclude other bibliographic records
-- that should not participate in the deduplication.

-- note: expanded to exclude certain bibs - rrh

CREATE TABLE m_dedupe.bibs_to_check AS
SELECT bre.id AS bib_id
FROM biblio.record_entry bre
JOIN config.bib_source bs ON bs.id = bre.source
WHERE bre.deleted = FALSE AND bs.can_have_copies = TRUE;
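
-- For example (an assumption, not part of the original script), one
-- might also exclude bibs whose source carries a hypothetical marker:
-- CREATE TABLE m_dedupe.bibs_to_check AS
-- SELECT bre.id AS bib_id
-- FROM biblio.record_entry bre
-- JOIN config.bib_source bs ON bs.id = bre.source
-- WHERE bre.deleted = FALSE
--   AND bs.can_have_copies = TRUE
--   AND bs.source <> 'Do Not Dedupe';  -- hypothetical source name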

-- staging table for the match keys
CREATE TABLE m_dedupe.match_keys (
    norm_isbn TEXT,
    norm_title TEXT,
    qual TEXT,
    bibid BIGINT
);

-- calculate match keys
INSERT INTO m_dedupe.match_keys
SELECT (a.get_isbn_match_key::mig_isbn_match).norm_isbn,
       (a.get_isbn_match_key::mig_isbn_match).norm_title,
       (a.get_isbn_match_key::mig_isbn_match).qual,
       (a.get_isbn_match_key::mig_isbn_match).bibid
FROM (
    SELECT m_dedupe.get_isbn_match_key(bre.id, bre.marc)
    FROM biblio.record_entry bre
    JOIN m_dedupe.bibs_to_check c ON (c.bib_id = bre.id)
) a;

CREATE INDEX norm_idx ON m_dedupe.match_keys(norm_isbn, norm_title);
CREATE INDEX qual_idx ON m_dedupe.match_keys(qual);

-- and remove duplicates
CREATE TEMPORARY TABLE uniq_match_keys AS
SELECT DISTINCT norm_isbn, norm_title, qual, bibid
FROM m_dedupe.match_keys;

DELETE FROM m_dedupe.match_keys;
INSERT INTO m_dedupe.match_keys SELECT * FROM uniq_match_keys;

-- find highest-quality match keys
CREATE TABLE m_dedupe.lead_quals AS
SELECT MAX(qual) AS max_qual, norm_isbn, norm_title
FROM m_dedupe.match_keys
GROUP BY norm_isbn, norm_title
HAVING COUNT(*) > 1;

CREATE INDEX norm_idx2 ON m_dedupe.lead_quals(norm_isbn, norm_title);
CREATE INDEX norm_qual_idx2 ON m_dedupe.lead_quals(norm_isbn, norm_title, max_qual);
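
-- Optional sanity check (not part of the original script): review the
-- largest clusters of duplicates before building the merge map.
SELECT norm_isbn, norm_title, COUNT(*) AS dupes
FROM m_dedupe.match_keys
GROUP BY norm_isbn, norm_title
HAVING COUNT(*) > 1
ORDER BY dupes DESC
LIMIT 20;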

-- identify prospective lead bibs
CREATE TABLE m_dedupe.prospective_leads AS
SELECT bibid, a.norm_isbn, a.norm_title, b.max_qual, COUNT(ac.id) AS copy_count
FROM m_dedupe.match_keys a
JOIN m_dedupe.lead_quals b ON (a.qual = b.max_qual AND a.norm_isbn = b.norm_isbn AND a.norm_title = b.norm_title)
JOIN asset.call_number acn ON (acn.record = bibid)
JOIN asset.copy ac ON (ac.call_number = acn.id)
WHERE NOT acn.deleted
AND NOT ac.deleted
GROUP BY bibid, a.norm_isbn, a.norm_title, b.max_qual;

-- and use number of copies to break ties
CREATE TABLE m_dedupe.best_lead_keys AS
SELECT norm_isbn, norm_title, max_qual, MAX(copy_count) AS copy_count
FROM m_dedupe.prospective_leads
GROUP BY norm_isbn, norm_title, max_qual;

CREATE TABLE m_dedupe.best_leads AS
SELECT bibid, a.norm_isbn, a.norm_title, a.max_qual, copy_count
FROM m_dedupe.best_lead_keys a
JOIN m_dedupe.prospective_leads b USING (norm_isbn, norm_title, max_qual, copy_count);

-- and break any remaining ties using the lowest bib ID as the winner
CREATE TABLE m_dedupe.unique_leads AS
SELECT MIN(bibid) AS lead_bibid, norm_isbn, norm_title, max_qual
FROM m_dedupe.best_leads
GROUP BY norm_isbn, norm_title, max_qual;

-- start computing the merge map
CREATE TABLE m_dedupe.merge_map_pre AS
SELECT DISTINCT lead_bibid, bibid AS sub_bibid
FROM m_dedupe.unique_leads
JOIN m_dedupe.match_keys USING (norm_isbn, norm_title)
WHERE lead_bibid <> bibid;

-- and resolve transitive maps: each pass re-points subordinates whose
-- lead is itself a subordinate one hop up the chain, so three passes
-- flatten chains up to four records deep
UPDATE m_dedupe.merge_map_pre a
SET lead_bibid = b.lead_bibid
FROM m_dedupe.merge_map_pre b
WHERE a.lead_bibid = b.sub_bibid;

UPDATE m_dedupe.merge_map_pre a
SET lead_bibid = b.lead_bibid
FROM m_dedupe.merge_map_pre b
WHERE a.lead_bibid = b.sub_bibid;

UPDATE m_dedupe.merge_map_pre a
SET lead_bibid = b.lead_bibid
FROM m_dedupe.merge_map_pre b
WHERE a.lead_bibid = b.sub_bibid;
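
-- Sanity check (not part of the original script): if this returns a
-- nonzero count, chains deeper than the three passes above remain;
-- repeat the UPDATE until the count reaches zero.
SELECT COUNT(*)
FROM m_dedupe.merge_map_pre a
JOIN m_dedupe.merge_map_pre b ON (a.lead_bibid = b.sub_bibid);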

-- and produce the final merge map
CREATE TABLE m_dedupe.merge_map AS
SELECT MIN(lead_bibid) AS lead_bibid, sub_bibid
FROM m_dedupe.merge_map_pre
GROUP BY sub_bibid;
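
-- Optional review (an assumption, not part of the original script;
-- reporter.materialized_simple_record is present in stock Evergreen):
-- eyeball a sample of the planned merges before running them.
SELECT mm.lead_bibid, l.title AS lead_title,
       mm.sub_bibid, s.title AS sub_title
FROM m_dedupe.merge_map mm
JOIN reporter.materialized_simple_record l ON (l.id = mm.lead_bibid)
JOIN reporter.materialized_simple_record s ON (s.id = mm.sub_bibid)
LIMIT 20;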

-- add a unique ID to the merge map so that
-- we can do the actual record merging in chunks
ALTER TABLE m_dedupe.merge_map ADD COLUMN id SERIAL, ADD COLUMN done BOOLEAN DEFAULT FALSE;

-- and here's an example of processing a chunk of 1,000 merges;
-- run this pair of statements repeatedly until no rows remain
-- with done = false
SELECT asset.merge_record_assets(lead_bibid, sub_bibid)
FROM m_dedupe.merge_map WHERE id IN (
    SELECT id FROM m_dedupe.merge_map
    WHERE done = FALSE
    ORDER BY id
    LIMIT 1000
);

UPDATE m_dedupe.merge_map SET done = TRUE
WHERE id IN (
    SELECT id FROM m_dedupe.merge_map
    WHERE done = FALSE
    ORDER BY id
    LIMIT 1000
);