WorldTeacher

code

Apr 27th, 2022 (edited)
686
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.12 KB | None | 0 0
  1. import os
  2. import glob
  3. import time
  4. from urllib.request import urlopen
  5. import pandas as pd
  6. import xml.etree.ElementTree as ET
  7. count=0
  8. files=glob.glob('./extract/isbnlist/Reihe*_isbn-dnb2.csv',recursive=True) #searches all files in folder
  9. print(files)
  10.  
  11. for file in files:
  12.     if count==0: #to only go through the first file, instead of all files in the folder
  13.         csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
  14.         for row in csvfile['URL']:
  15.             print('row: ' + row)
  16.             with urlopen(str(row)) as response:
  17.                 doc = ET.parse(response)  
  18.                 root = doc.getroot()
  19.                 namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
  20.             "zs": "http://www.loc.gov/zing/srw/",
  21.             "": "http://www.loc.gov/MARC21/slim",
  22.                 }
  23.             datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
  24.             datafield_attribute_filters = [ #which fields to extract
  25.             {
  26.             "tag": "100", #author
  27.             "ind1": "1",
  28.             "ind2": " ",
  29.             }]
  30.             #datafield_attribute_filters = []  # Decomment this line to clear filters (and process each datafield node)
  31.             aut = []
  32.             for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
  33.                 if datafield_attribute_filters:
  34.                     skip_node = True
  35.                     for attr_dict in datafield_attribute_filters:
  36.                         for k, v in attr_dict.items():
  37.                             if datafield_node.get(k) != v:
  38.                                 break
  39.                         else:
  40.                             skip_node = False
  41.                             break
  42.                     if skip_node:
  43.                         continue
  44.                 for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
  45.                     aut.append(subfield_node.text) #this gets the author name and title
  46.                    
  47.             print(aut)
  48.         count+=1
Add Comment
Please, Sign In to add comment