Advertisement
Guest User

Untitled

a guest
Jan 18th, 2020
137
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.87 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. import os
  5. import sys
  6. import optparse
  7. import csv
  8.  
  9. import gzip
  10. import time
  11. import xml.parsers.expat
  12.  
  13. import requests
  14.  
  15. from bs4 import BeautifulSoup
  16.  
# Default input directories (overridable via --meta / --html).
META_DIR = './meta/'   # holds <identifier>_meta.xml or .xml.gz files
HTML_DIR = './html/'   # holds <identifier>.html or .html.gz files

__version__ = 'r4 (2017/11/28)'

# Shared mutable state used by the expat handler callbacks below.
parsed_data = dict()   # tag name -> list of joined text chunks for that tag
open_tag = ""          # name of the most recently opened XML element
text_list = []         # character-data fragments collected for the open element
  26.  
  27. def parse_command_line(argv):
  28. """Command line options parser for the script
  29. """
  30. usage = "Usage: %prog [options] <CSV input file>"
  31.  
  32. parser = optparse.OptionParser(usage=usage)
  33. parser.add_option("-o", "--outfile", action="store",
  34. type="string", dest="outfile", default="archive-out.csv",
  35. help="Output CSV filename (default: 'archive-out.csv')")
  36. parser.add_option("--meta", action="store",
  37. type="string", dest="meta", default=META_DIR,
  38. help="Meta files directory (default: '{:s}')".format(META_DIR))
  39. parser.add_option("--html", action="store",
  40. type="string", dest="html", default=HTML_DIR,
  41. help="HTML files directory (default: '{:s}')".format(HTML_DIR))
  42. parser.add_option("-s", "--skip", action="store",
  43. type="int", dest="skip", default=0,
  44. help="Skip rows (default: 0)")
  45.  
  46. return parser.parse_args(argv)
  47.  
  48.  
  49. # 3 handler functions
  50. def start_element(name, attrs):
  51. global open_tag
  52. #print('Start element:', name, attrs)
  53. open_tag = name
  54.  
  55.  
  56. def end_element(name):
  57. global open_tag, text_list
  58. #print('End element:', name)
  59. if name == open_tag:
  60. #print(open_tag, "==>", '\n'.join(text_list))
  61. if open_tag not in parsed_data:
  62. parsed_data[open_tag] = []
  63. parsed_data[open_tag].append('\n'.join(text_list))
  64. text_list = []
  65. open_tag = ""
  66. pass
  67.  
  68.  
  69. def char_data(data):
  70. #print('Character data:', repr(data))
  71. if data.strip() != "":
  72. text_list.append(data.strip())
  73. pass
  74.  
  75. if __name__ == "__main__":
  76.  
  77. print("{:s} - {:s}\n".format(os.path.basename(sys.argv[0]), __version__))
  78. (options, args) = parse_command_line(sys.argv)
  79. if len(args) < 2:
  80. print("Usage: {:s} [options] <CSV input file>".format(os.path.basename(sys.argv[0])))
  81. sys.exit(-1)
  82.  
  83. print("Parse all meta files to extract all possible fields, please wait...")
  84. count = 0
  85. columns = set()
  86. f = open(args[1])
  87. reader = csv.DictReader(f)
  88. for i, r in enumerate(reader):
  89. if i < options.skip:
  90. continue
  91. count += 1
  92. _id = r['identifier']
  93. file_name = os.path.join(options.meta, _id + "_meta.xml")
  94. if not os.path.isfile(file_name):
  95. file_name += ".gz"
  96.  
  97. parsed_data = dict()
  98.  
  99. if os.path.isfile(file_name):
  100. if file_name.endswith('.gz'):
  101. fxml = gzip.open(file_name)
  102. else:
  103. fxml = open(file_name, 'r')
  104. xmlstr = fxml.read()
  105. fxml.close()
  106.  
  107. p = xml.parsers.expat.ParserCreate()
  108.  
  109. p.StartElementHandler = start_element
  110. p.EndElementHandler = end_element
  111. p.CharacterDataHandler = char_data
  112. try:
  113. p.Parse(xmlstr)
  114. #for t in parsed_data:
  115. # parsed_data[t] = '|'.join(parsed_data[t])
  116. columns.update(list(parsed_data))
  117. except:
  118. print("WARN: Cannot parse {0}".format(file_name))
  119. f.close()
  120.  
  121. print("Total: {:d}, Meta fields count: {:d}".format(count, len(columns)))
  122.  
  123. try:
  124. if os.path.isfile(options.outfile):
  125. o = open(options.outfile, encoding='utf-8', newline='')
  126. reader = csv.DictReader(o)
  127. columns = reader.fieldnames
  128. o.close()
  129. o = open(options.outfile, "at", encoding='utf-8', newline='')
  130. writer = csv.DictWriter(o, fieldnames=sorted(list(columns), extrasaction='ignore'))
  131. else:
  132. o = open(options.outfile, "wt", encoding='utf-8', newline='')
  133. writer = csv.DictWriter(o, fieldnames=sorted(list(columns)) + ['text'], extrasaction='ignore')
  134. writer.writeheader()
  135. except:
  136. if os.path.isfile(options.outfile):
  137. o = open(options.outfile)
  138. reader = csv.DictReader(o)
  139. columns = reader.fieldnames
  140. o.close()
  141. o = open(options.outfile, "ab")
  142. writer = csv.DictWriter(o, fieldnames=sorted(list(columns)), extrasaction='ignore')
  143. else:
  144. o = open(options.outfile, "wb")
  145. writer = csv.DictWriter(o, fieldnames=sorted(list(columns)) + ['text'], extrasaction='ignore')
  146. writer.writeheader()
  147.  
  148. count = 0
  149. f = open(args[1])
  150. reader = csv.DictReader(f)
  151. for i, r in enumerate(reader):
  152. if i < options.skip:
  153. continue
  154. count += 1
  155. _id = r['identifier']
  156. print("#{:d}: {:s}".format(count, _id))
  157.  
  158. # Parse meta file to extract meta fields
  159. file_name = os.path.join(options.meta, _id + "_meta.xml")
  160. if not os.path.isfile(file_name):
  161. file_name += ".gz"
  162.  
  163. parsed_data = dict()
  164.  
  165. if os.path.isfile(file_name):
  166. if file_name.endswith('.gz'):
  167. fxml = gzip.open(file_name)
  168. else:
  169. fxml = open(file_name, 'r')
  170. xmlstr = fxml.read()
  171. fxml.close()
  172.  
  173. p = xml.parsers.expat.ParserCreate()
  174.  
  175. p.StartElementHandler = start_element
  176. p.EndElementHandler = end_element
  177. p.CharacterDataHandler = char_data
  178. try:
  179. p.Parse(xmlstr)
  180. for t in parsed_data:
  181. parsed_data[t] = '|'.join(parsed_data[t]).encode('utf-8')
  182. except:
  183. print("WARN: Cannot parse '{0}'".format(file_name))
  184. continue
  185.  
  186. # Parse HTML file for closed-caption
  187. file_name = os.path.join(options.html, _id + ".html")
  188. if not os.path.isfile(file_name):
  189. file_name += ".gz"
  190.  
  191. if os.path.isfile(file_name):
  192. if file_name.endswith('.gz'):
  193. fhtml = gzip.open(file_name, 'r')
  194. else:
  195. fhtml = open(file_name, 'r')
  196. htmlstr = fhtml.read()
  197. fhtml.close()
  198.  
  199. soup = BeautifulSoup(htmlstr, 'html.parser')
  200. htmlstr = soup.prettify()
  201. soup = BeautifulSoup(htmlstr, 'html.parser')
  202. text = ""
  203. for a in soup.find_all('div', {'class': 'snipin nosel'}):
  204. text += a.text.strip()
  205.  
  206. parsed_data['text'] = text.encode('utf-8')
  207. writer.writerow(parsed_data)
  208.  
  209. f.close()
  210. o.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement