Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import elasticsearch
- from elasticsearch.helpers import scan
- from nltk.tokenize import sent_tokenize
- import os
- import certifi
- import pandas as pd
- import numpy
- import requests
- import re
- import json
- from bs4 import BeautifulSoup
- import time
# Record wall-clock start time so total runtime can be reported at the end.
start_time = time.time()

# Pulling case text
# NOTE(review): es_url is literally the string 'password' -- a real
# endpoint/credential was presumably redacted before pasting.  Restore the
# actual Elasticsearch URL (and keep secrets out of source) before running.
es_url = 'password'
es = elasticsearch.Elasticsearch([es_url],
                                 #http_auth=(es_user, es_password),
                                 timeout=30,            # per-request timeout, in seconds
                                 max_retries=2,         # retry transient failures twice
                                 use_ssl=True,
                                 verify_certs=True,
                                 ca_certs=certifi.where())  # certifi's CA bundle for TLS verification

# Match every document in the index, returning only the 'title' and 'text'
# fields of each hit.
query = {
    "query": {
        "match_all": {}
    }, "_source": ['title', 'text']
    #}, "_source": 'title'
}

# helpers.scan pages through all hits of the query; note that it yields a
# SINGLE-PASS generator, not a re-iterable sequence.
my_scan = scan(client=es, query=query, index='primary_docs', doc_type='case')
# Collect the first 100 hits, pairing each document id ("slug") with its text.
#
# BUG FIX: the original code iterated my_scan twice -- one loop for text,
# a second loop for slugs.  scan() returns a single-pass generator, so the
# second loop consumed the NEXT 100 documents, and the slug/text columns of
# the DataFrame did not describe the same cases.  A single pass over the
# generator keeps the two lists aligned.
slugs = []
text = []
for x in my_scan:
    if len(slugs) >= 100:
        #if len(slugs) >= 100000:
        break
    slugs.append(x['_id'])
    text.append(x['_source']['text'])

df = pd.DataFrame()
df["slug"] = slugs
df['text'] = text
#print(df)
print("got slugs + text")
- #
- #def get_soup(slug):
- # try:
- # url = 'https://law.casetext.com/doc/{}/html'.format(slug)
- # res = requests.get(url)
- # html = res.content
- # soup = BeautifulSoup(html, 'lxml')
- # soup = soup.get_text(strip= True)
- # return soup
- # except Exception as e:
- # print("Failed to retrieve html for {}: {}.".format(slug, e))
- # return "brick"
- #
- #caseText = []
- #
- #for slug in df['slug']:
- # #caseText += [get_soup(slug).lower()]
- # caseText += [get_soup(slug)]
- # #print("slug added: ", slug, " index number: ", df['slug'].index(slug))
- #
- ##caseText = list(map(lambda x: x.get_text(strip = True)), caseText)
- #df['text'] = caseText
- #
- #print("got cases")
# REPORTER CITE
# Load the mapping of reporter abbreviations (e.g. "A.2d", "U.S.") and build
# a regex alternation that matches any one of them.
with open('reporter_citations.json') as json_file:
    reporters_dict = json.load(json_file)

# FIX: the original escaped only '.', '&', '(' and ')' with chained
# str.replace() calls, leaving other regex metacharacters (+ * [ ] ? |, ...)
# live in the pattern, and used non-raw strings like "\." that emit
# invalid-escape-sequence warnings.  re.escape() escapes every regex
# metacharacter correctly in one call.
repcites = [re.escape(name) for name in reporters_dict.keys()]

#repcites_regex = r'\d{1,4}( ' + " | ".join(repcites) + r' )\d{1,4}'
#repcites_regex = '( ' + " | ".join(repcites) + ' )'
repcites_regex = " | ".join(repcites)
#==============================================================================
# final regex
# final regex model:
# (see also| but see| But cf\.| see| contra) <up to 50 chars of case name>
#   <volume REPORTER page> [, pin cite | parallel cite]? (parenthetical)
# e.g. with reporters A.2d / A.3d / U.S. substituted for repcites_regex.
final_regex = (r'\(.*?.*?\)(see also| but see| But cf\.| see| contra)'
               r' [a-zA-Z\s\.\,]{1,50}(\d{1,4}(' + repcites_regex + r')\d{1,4})'
               r'(, \d{1,4} \(\d{1,4}\)| (\d{1,4}(' + repcites_regex + r')\d{1,4}))?'
               r' \(.*?.*?\)')
compiled_oneshot_regex = re.compile(final_regex)

countd = 0
regex_result = []
for case in df['text']:
    #print(df[df['text']==case].index.values.astype(int))
    print(countd)
    countd += 1
    # FIX: the original wrapped this in a bare `except:` to cover search()
    # returning None (AttributeError on .group(0)).  A bare except also
    # swallows KeyboardInterrupt, SystemExit, TypeError, etc.  Check the
    # match explicitly; the isinstance guard preserves the original
    # best-effort "" result for non-string cells (e.g. NaN).
    match = compiled_oneshot_regex.search(case) if isinstance(case, str) else None
    regex_result.append(match.group(0) if match else "")

df['regex_results'] = regex_result
print('got regex results')
# components
# Split each full regex hit into its three parts:
#   quote1 - the text (parenthetical + case name) preceding the signal
#   signal - the citation signal itself (see / see also / but see / ...)
#   quote2 - the first parenthetical following the cited case
quote_1 = []
signal_found = []
quote_2 = []

# FIX: these two patterns were re.compile()d on every loop iteration even
# though they never change; compile them once, outside the loop.
compiled_signal_regex = re.compile(r'(see also| but see| But cf\.| see| contra)')
compiled_quote_regex = re.compile(r'\(.*?.*?\)')

count = 0
for x in df['regex_results']:
    print(count)
    count += 1
    #print("type of x: ", x, "\n x: ", x)
    if x != "":
        # The signal pattern is wrapped in a capturing group, so split()
        # yields [before, signal, after, ...]; indices 0 and 2 are always
        # present because x was produced by a regex containing the signal.
        signal = compiled_signal_regex.search(x).group(0)
        parts = compiled_signal_regex.split(x)
        q1 = parts[0]
        q2 = compiled_quote_regex.search(parts[2]).group(0)
        quote_1.append(q1)
        signal_found.append(signal)
        quote_2.append(q2)
    else:
        # No citation was found in this case; keep the rows aligned.
        quote_1.append("")
        signal_found.append("")
        quote_2.append("")

df['quote1'] = quote_1
df['signal'] = signal_found
df['quote2'] = quote_2
df.to_csv("full_regex_test.csv")
print("--- %s seconds ---" % (time.time() - start_time))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement