Advertisement
Guest User

Untitled

a guest
Jul 22nd, 2019
88
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.60 KB | None | 0 0
  1. import elasticsearch
  2. from elasticsearch.helpers import scan
  3. from nltk.tokenize import sent_tokenize
  4. import os
  5. import certifi
  6. import pandas as pd
  7. import numpy
  8. import requests
  9. import re
  10. import json
  11. from bs4 import BeautifulSoup
  12. import time
  # Wall-clock reference; the script's final line reports the elapsed time
  # relative to this value.
  start_time = time.time()

  # Pulling case text

  # NOTE(review): 'password' looks like a redacted placeholder rather than a
  # real cluster URL -- confirm before running.
  es_url = 'password'

  # Connection settings, gathered in one place and expanded into the client
  # constructor. Basic-auth credentials (http_auth=(user, password)) are not
  # supplied.
  _es_options = dict(
      timeout=30,
      max_retries=2,
      use_ssl=True,
      verify_certs=True,
      ca_certs=certifi.where(),
  )
  es = elasticsearch.Elasticsearch([es_url], **_es_options)
  25.  
  # Search body: match every document and return only the 'title' and 'text'
  # source fields.
  query = dict(
      query=dict(match_all=dict()),
      _source=['title', 'text'],
  )

  # Hits from the 'primary_docs' index (doc type 'case'); the code further down
  # iterates this object and reads each hit's '_id' and '_source'.
  my_scan = scan(es, query=query, index='primary_docs', doc_type='case')
  34.  
  # Collect the document id ("slug") and the body text of the first MAX_DOCS
  # hits into a two-column DataFrame.
  #
  # Both values are read from the SAME hit in a single pass. my_scan yields
  # hits lazily and is used up as it is iterated, so walking it twice (once
  # for the text, once for the ids) would pair every text with the id of a
  # different, later document -- and on a small index would leave the two
  # lists with different lengths, which pandas rejects.
  MAX_DOCS = 100  # raise (e.g. to 100000) for a full run

  slugs = []
  text = []
  # zip() consults range() first, so iteration stops after MAX_DOCS hits
  # without pulling (and discarding) an extra hit from the scan.
  for _, hit in zip(range(MAX_DOCS), my_scan):
      slugs.append(hit['_id'])
      # A hit whose source lacks a 'text' field contributes an empty string
      # instead of aborting the whole run with a KeyError.
      text.append(hit['_source'].get('text', ''))

  df = pd.DataFrame({'slug': slugs, 'text': text})

  print("got slugs + text")
  59. #
  60. #def get_soup(slug):
  61. # try:
  62. # url = 'https://law.casetext.com/doc/{}/html'.format(slug)
  63. # res = requests.get(url)
  64. # html = res.content
  65. # soup = BeautifulSoup(html, 'lxml')
  66. # soup = soup.get_text(strip= True)
  67. # return soup
  68. # except Exception as e:
  69. # print("Failed to retrieve html for {}: {}.".format(slug, e))
  70. # return "brick"
  71. #
  72. #caseText = []
  73. #
  74. #for slug in df['slug']:
  75. # #caseText += [get_soup(slug).lower()]
  76. # caseText += [get_soup(slug)]
  77. # #print("slug added: ", slug, " index number: ", df['slug'].index(slug))
  78. #
  79. ##caseText = list(map(lambda x: x.get_text(strip = True)), caseText)
  80. #df['text'] = caseText
  81. #
  82. #print("got cases")
  83.  
  84.  
  # REPORTER CITE
  # Build the one-shot pattern for a "parenthetical - signal - citation -
  # parenthetical" span. Model, with three sample reporters:
  #
  #   \(.*?\)(see also| but see| But cf\.| see| contra) [a-zA-Z\s\.\,]{1,50}
  #   (\d{1,4} (A\.2d|A\.3d|U\.S\.) \d{1,4})
  #   (, \d{1,4} \(\d{1,4}\)| (\d{1,4} (A\.2d|A\.3d|U\.S\.) \d{1,4}))? \(.*?\)
  with open('reporter_citations.json', encoding='utf-8') as json_file:
      reporters_dict = json.load(json_file)

  # Only the keys of the JSON object are used; they are the reporter strings
  # to match (presumably abbreviations such as "A.2d" -- confirm against the
  # data file). re.escape() neutralises every regex metacharacter, not just
  # the four ('.', '&', '(', ')') that were previously replaced by hand.
  repcites = [re.escape(reporter) for reporter in reporters_dict]
  repcites_regex = "|".join(repcites)

  # <volume> <reporter> <page>. The separating spaces sit OUTSIDE the
  # alternation so every reporter requires one on both sides; joining the
  # alternatives with " | " gave the first alternative no leading space and
  # the last one no trailing space.
  reporter_cite = r'\d{1,4} (' + repcites_regex + r') \d{1,4}'

  # Raw strings keep the backslashes intact without relying on Python's
  # deprecated handling of unknown escape sequences such as '\d' or '\('.
  # '.*?' replaces the former '.*?.*?': it matches the same text with less
  # backtracking.
  final_regex = ''.join([
      r'\(.*?\)',                                      # leading parenthetical
      r'(see also| but see| But cf\.| see| contra)',   # citation signal
      r' [a-zA-Z\s\.\,]{1,50}',                        # case name
      '(' + reporter_cite + ')',                       # primary citation
      r'(, \d{1,4} \(\d{1,4}\)| (' + reporter_cite + r'))?',  # pin cite + year, or parallel cite
      r' \(.*?\)',                                     # trailing parenthetical
  ])
  compiled_oneshot_regex = re.compile(final_regex)
  105.  
  # Run the one-shot citation pattern over every case text and keep the first
  # matching span per case; cases without a match get an empty string so the
  # result stays aligned with the DataFrame rows.
  regex_result = []
  for countd, case in enumerate(df['text']):
      print(countd)  # progress indicator
      # A non-string cell (e.g. a missing text) cannot be searched; it is
      # recorded as "no match". Testing for that explicitly replaces a bare
      # `except:` that also hid unrelated errors and KeyboardInterrupt.
      match = compiled_oneshot_regex.search(case) if isinstance(case, str) else None
      regex_result.append(match.group(0) if match is not None else "")

  df['regex_results'] = regex_result
  print('got regex results')
  120.  
  # components
  # Break every matched span into its three parts -- the text before the
  # signal, the signal itself, and the trailing parenthetical -- then write
  # the finished table to CSV and report the total run time.

  # Compiled once, outside the loop. Raw strings keep the regex backslashes
  # without relying on Python's deprecated unknown-escape handling.
  signal_regex = r'(see also| but see| But cf\.| see| contra)'
  compiled_signal_regex = re.compile(signal_regex)
  quote_regex = r'\(.*?\)'
  compiled_quote_regex = re.compile(quote_regex)

  quote_1 = []
  signal_found = []
  quote_2 = []

  for count, x in enumerate(df['regex_results']):
      print(count)  # progress indicator
      q1 = signal = q2 = ""
      if x != "":
          # Split on the FIRST signal only. The pattern has one capturing
          # group, so a successful split yields [before, signal, after]; an
          # unbounded split would cut `after` short whenever a signal word
          # shows up again inside the trailing parenthetical.
          parts = compiled_signal_regex.split(x, maxsplit=1)
          if len(parts) == 3:
              q1, signal, after_signal = parts
              # Take the LAST parenthetical: the citation itself may carry a
              # year such as "(1990)" ahead of the closing parenthetical.
              parentheticals = compiled_quote_regex.findall(after_signal)
              if parentheticals:
                  q2 = parentheticals[-1]
      # Rows without a usable match keep empty strings so the three lists
      # stay aligned with the DataFrame.
      quote_1.append(q1)
      signal_found.append(signal)
      quote_2.append(q2)

  df['quote1'] = quote_1
  df['signal'] = signal_found
  df['quote2'] = quote_2

  df.to_csv("full_regex_test.csv")
  print("--- %s seconds ---" % (time.time() - start_time))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement