Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os
import glob
import time
from urllib.request import urlopen
import pandas as pd
import xml.etree.ElementTree as ET

# XML namespaces used by the DNB SRU responses. These are loop-invariant,
# so they are defined once here instead of being rebuilt for every URL.
# (Manually extracted from the XML file; could be auto-detected instead.)
NAMESPACES = {
    "zs": "http://www.loc.gov/zing/srw/",
    "": "http://www.loc.gov/MARC21/slim",  # default (unprefixed) namespace
}

# XPath from the SRU response root down to the MARC21 datafield nodes.
DATAFIELD_NODES_PATH = "./zs:records/zs:record/zs:recordData/record/datafield"

# Which datafields to extract: tag 100 / ind1 "1" / ind2 " " is the MARC21
# main-entry personal name (the author). Use [] to process every datafield.
DATAFIELD_ATTRIBUTE_FILTERS = [
    {
        "tag": "100",  # author
        "ind1": "1",
        "ind2": " ",
    },
]


def _matches_filters(datafield_node, attribute_filters):
    """Return True if the node's attributes fully match at least one filter dict.

    An empty ``attribute_filters`` list accepts every node (same semantics as
    the original "decomment to clear filters" behavior).
    """
    if not attribute_filters:
        return True
    return any(
        all(datafield_node.get(k) == v for k, v in attr_dict.items())
        for attr_dict in attribute_filters
    )


def extract_authors(root, attribute_filters=DATAFIELD_ATTRIBUTE_FILTERS):
    """Collect the text of every <subfield code="a"> in matching datafields.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of one parsed SRU response document.
    attribute_filters : list[dict]
        Attribute filters applied to each datafield node; an empty list
        accepts all nodes.

    Returns
    -------
    list[str]
        Subfield 'a' texts (author name / title strings) in document order.
    """
    authors = []
    for datafield_node in root.iterfind(DATAFIELD_NODES_PATH, namespaces=NAMESPACES):
        if not _matches_filters(datafield_node, attribute_filters):
            continue
        for subfield_node in datafield_node.iterfind(
            "./subfield[@code='a']", namespaces=NAMESPACES
        ):
            authors.append(subfield_node.text)
    return authors


def main():
    """Fetch each URL from the first matching CSV and print its author list."""
    # NOTE: recursive=True only has an effect when the pattern contains '**';
    # kept for compatibility with the original call.
    files = glob.glob('./extract/isbnlist/Reihe*_isbn-dnb2.csv', recursive=True)
    print(files)
    # Only the first file is processed (the original used a count flag for
    # this); files[:1] also handles an empty glob result gracefully.
    for file in files[:1]:
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        for row in csvfile['URL']:
            print('row: ' + row)
            # Stream the XML response straight into the parser; the context
            # manager guarantees the connection is closed.
            with urlopen(str(row)) as response:
                root = ET.parse(response).getroot()
            print(extract_authors(root))


if __name__ == "__main__":
    main()
Add Comment
Please, Sign In to add comment