Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import glob  # for testing with local files
import os
import sys  # import libraries
import time
from urllib.parse import urlencode  # building the SRU query string
from urllib.request import urlopen  # when trying to parse xml from url
from xml.etree import ElementTree as ET  # parsing xml

import pandas as pd  # for csv
# Look up every ISBN listed in the local extract CSV files via the K10plus SRU
# interface and collect author / title / subtitle of each returned MARC record
# into a single CSV file.

# Glob pattern of the tab-separated input files; each must have an "ISBN" column.
INPUT_PATTERN = './extract/Reihe A/Reihe*_extract.csv'
# Result file, written once after all lookups are done.
OUTPUT_FILE = "api2.csv"
# SRU endpoint queried for every ISBN.
SRU_BASE_URL = "https://sru.k10plus.de/opac-de-627!rec=1"
# Seconds to wait for the server before giving up on a single lookup.
REQUEST_TIMEOUT = 30

# XML namespaces of the SRU envelope ("zs") and of the embedded MARCXML ("marc").
NAMESPACES = {
    "zs": "http://www.loc.gov/zing/srw/",
    "marc": "http://www.loc.gov/MARC21/slim",
}
# XPath from the response root to every MARC record.
RECORD_PATH = "./zs:records/zs:record/zs:recordData/marc:record"

# Attribute filters selecting the datafields to extract.
# NOTE(review): the indicators are matched exactly, so e.g. a 245 field with a
# different ind1/ind2 is ignored -- confirm this is intended.
AUTHOR_FIELD = {"tag": "100", "ind1": "1", "ind2": " "}
TITLE_FIELD = {"tag": "245", "ind1": "1", "ind2": "0"}

# Column order of the output CSV.
COLUMNS = ["author", "title", "subtitle", "isbn"]


def build_query_url(isbn):
    """Return the SRU searchRetrieve URL for *isbn*.

    The query value is percent-encoded, so an ISBN containing unexpected
    characters cannot break the URL.
    """
    params = {
        "version": "1.1",
        "operation": "searchRetrieve",
        # presumably pica.bib restricts the search to one library -- confirm
        "query": f"pica.isb={isbn} and pica.bib=20735",
        "maximumRecords": "10",
        "recordSchema": "marcxml",
    }
    return SRU_BASE_URL + "?" + urlencode(params)


def subfield_values(record, field_filter, code):
    """Return the texts of all subfields *code* in matching datafields.

    A datafield of *record* matches when every attribute named in
    *field_filter* has exactly the given value. Empty subfields are skipped.
    """
    values = []
    for field in record.iterfind("marc:datafield", NAMESPACES):
        if any(field.get(attr) != wanted for attr, wanted in field_filter.items()):
            continue
        for subfield in field.iterfind(f"marc:subfield[@code='{code}']", NAMESPACES):
            if subfield.text:
                values.append(subfield.text)
    return values


def extract_rows(root, isbn):
    """Return one result row (a dict keyed by COLUMNS) per MARC record.

    *root* is the root element of an SRU response and *isbn* the ISBN that was
    queried. Repeated values inside a record are joined with "; " and missing
    ones become "", so every record yields exactly one complete row.
    """
    return [
        {
            "author": "; ".join(subfield_values(record, AUTHOR_FIELD, "a")),
            "title": "; ".join(subfield_values(record, TITLE_FIELD, "a")),
            "subtitle": "; ".join(subfield_values(record, TITLE_FIELD, "b")),
            "isbn": isbn,
        }
        for record in root.iterfind(RECORD_PATH, NAMESPACES)
    ]


def read_isbns(csv_path):
    """Return the non-empty ISBNs from the "ISBN" column of *csv_path*."""
    # Read the column as text: numeric parsing would drop leading zeros and
    # make the later string formatting fail.
    data = pd.read_csv(csv_path, sep="\t", encoding='utf8', dtype={"ISBN": str})
    stripped = (value.strip() for value in data["ISBN"].dropna())
    return [value for value in stripped if value]


def fetch_rows(isbn):
    """Query the SRU service for *isbn* and return the extracted rows.

    Raises OSError on network problems and ET.ParseError on malformed XML.
    """
    with urlopen(build_query_url(isbn), timeout=REQUEST_TIMEOUT) as response:
        root = ET.parse(response).getroot()
    return extract_rows(root, isbn)


def main():
    """Process all input files and write the combined result to OUTPUT_FILE."""
    # there is currently only one file, but later on, there will be multiple files
    filelist = glob.glob(INPUT_PATTERN, recursive=True)
    print(filelist)

    rows = []
    for csv_path in filelist:
        print('currently looking at file: ' + csv_path)
        for isbn in read_isbns(csv_path):
            try:
                rows.extend(fetch_rows(isbn))
            except (OSError, ET.ParseError) as err:
                # One failed lookup must not discard the results gathered so far.
                print(f"skipping ISBN {isbn}: {err}", file=sys.stderr)

    # Write once at the end; writing inside the loop would overwrite the file
    # on every ISBN and keep only the last result.
    if filelist:
        pd.DataFrame(rows, columns=COLUMNS).to_csv(OUTPUT_FILE)


if __name__ == "__main__":
    main()
Add Comment
Please, Sign In to add comment