WorldTeacher

get api data

Apr 13th, 2022 (edited)
269
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.79 KB | None | 0 0
  1. import sys #import libraries
  2. from xml.etree import ElementTree as ET #parsing xml
  3. import glob #for testing with local files
  4. from urllib.request import urlopen #when trying to parse xml from url
  5. import pandas as pd#for csv
  6. import time
  7. import os
  8. #safepath='./extract/isbnlist/'
  9. filelist=glob.glob('./extract/Reihe A/Reihe*_extract.csv',recursive=True) #there is currently only one file, but later on, there will be multiple files
  10. print(filelist)
  11. for file in filelist:
  12.     print('currently looking at file: ' + file)
  13.     #read csv, make a list of all isbns
  14.     data=pd.read_csv(file, sep="\t",  encoding='utf8')
  15.     isbnlist=[]
  16.     for row in data['ISBN']:
  17.         isbnlist.append(row)
  18.         #print ('currently looking at row: ' + row)
  19.        
  20.         for isbn in isbnlist:
  21.             with urlopen('https://sru.k10plus.de/opac-de-627!rec=1?version=1.1&operation=searchRetrieve&query=pica.isb%3D' + isbn + '+and+pica.bib%3D20735&maximumRecords=10&recordSchema=marcxml') as file:
  22.                 doc = ET.parse(file)  
  23.                 root = doc.getroot()
  24.                 namespaces = {  
  25.         "zs": "http://www.loc.gov/zing/srw/",
  26.         "": "http://www.loc.gov/MARC21/slim",
  27.         }
  28.         datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
  29.         datafield_attribute_filters = [ #which fields to extract
  30.         {
  31.             "tag": "100", #author
  32.             "ind1": "1",
  33.             "ind2": " ",
  34.         }]
  35.         aut = []
  36.         for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
  37.             if datafield_attribute_filters:
  38.                 skip_node = True
  39.                 for attr_dict in datafield_attribute_filters:
  40.                     for k, v in attr_dict.items():
  41.                         if datafield_node.get(k) != v:
  42.                             break
  43.                     else:
  44.                         skip_node = False
  45.                         break
  46.                 if skip_node:
  47.                     continue
  48.             for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
  49.                 aut.append(subfield_node.text) #this gets the author name
  50.         datafield_attribute_filters = [
  51.         {
  52.         "tag": "245", #title
  53.         "ind1": "1",
  54.         "ind2": "0",
  55.         }]
  56.         title = []
  57.         for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
  58.             if datafield_attribute_filters:
  59.                 skip_node = True
  60.                 for attr_dict in datafield_attribute_filters:
  61.                     for k, v in attr_dict.items():
  62.                         if datafield_node.get(k) != v:
  63.                             break
  64.                     else:
  65.                         skip_node = False
  66.                         break
  67.                 if skip_node:
  68.                     continue
  69.             for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
  70.                 title.append(subfield_node.text) #this gets the title
  71.         subtitle = []
  72.         for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
  73.             if datafield_attribute_filters:
  74.                 skip_node = True
  75.                 for attr_dict in datafield_attribute_filters:
  76.                     for k, v in attr_dict.items():
  77.                         if datafield_node.get(k) != v:
  78.                             break
  79.                     else:
  80.                         skip_node = False
  81.                         break
  82.                 if skip_node:
  83.                     continue
  84.             for subfield_node in datafield_node.iterfind("./subfield[@code='b']", namespaces=namespaces):
  85.                 subtitle.append(subfield_node.text)    
  86.         datafield_attribute_filters = [
  87.         {
  88.         "tag": "024",
  89.         "ind1": "3",
  90.         "ind2": " ",
  91.         }]
  92.         isbnextract = [] #this gets the isbn, currently errors out if there are multiple isbns
  93.         for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
  94.             if datafield_attribute_filters:
  95.                 skip_node = True
  96.                 for attr_dict in datafield_attribute_filters:
  97.                     for k, v in attr_dict.items():
  98.                         if datafield_node.get(k) != v:
  99.                             break
  100.                     else:
  101.                         skip_node = False
  102.                         break
  103.                 if skip_node:
  104.                     continue
  105.             for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
  106.                 isbnextract.append(subfield_node.text)
  107.        
  108.     df=pd.DataFrame({'author': aut, 'title': title, 'subtitle': subtitle, 'isbn': isbn})
  109.     #save as csv
  110.     df.to_csv("api2.csv")
Add Comment
Please, Sign In to add comment