Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import struct
- import mmh3
- import uuid
- from datetime import datetime
- import os
- from OCR import *
- from SpellCheck import *
- from langdetect import detect_langs, detect
- from datetime import datetime
- import time
- import requests
- from sys import exit
- from pyPdf import PdfFileReader
- import json
- import ast
# Module-level state shared by the helper functions in this file. The set*()
# functions overwrite these values; 0 is the "not configured yet" placeholder.
ORIGINAL_DIRECTORY = 0  # base directory that holds LangDictionaries/
FILEPATH_DIRECTORY = 0  # working directory under which OutputData/ is created
GUI = 0                 # object whose printToConsole() is called by printToConsole
ROOT = 0                # object whose update() is called by printToConsole
API_ENDPOINT = "192.168.133.110"
# setAPIconfig declares these as globals; defining them here means reading
# them before setAPIconfig() has run yields None instead of a NameError.
CRED_USERNAME = None
CRED_PASSWORD = None
def replacePunctuations(str):
    """Return ``str`` with every ASCII punctuation mark replaced by a space.

    Each mark is replaced one-for-one, so the length of the text and the
    position of every other character are unchanged.

    Args:
        str: The text to clean (the name is kept for keyword callers).

    Returns:
        The cleaned text.
    """
    text = str
    # One table instead of 32 chained .replace() calls; the marks are the
    # same set the original handled one statement at a time.
    for mark in "`~!@#$%^&*()-_=+[{]}\\|;:'\",<.>/?":
        text = text.replace(mark, " ")
    return text
def generate_uuid(timestamp, *vals):
    """Generate a UUID from the supplied timestamp and unique values.

    The left half of the UUID is the Unix time of ``timestamp`` expressed in
    milliseconds. The elapsed time is truncated to whole seconds *before*
    being scaled, so the millisecond digits are always zero. The right half
    is the first 8 bytes of a murmur3 hash of the string representation of
    each of ``vals``, joined with a ';' character. Always pass the value
    fields in the same order to get a repeatable UUID.

    Args:
        timestamp: A naive ``datetime``; it is subtracted from
            ``datetime.utcfromtimestamp(0)``, which is itself naive.
        *vals: Values identifying the record; each is converted with ``str``.

    Returns:
        A ``uuid.UUID`` built from the 16 packed bytes.

    Raises:
        struct.error: If ``timestamp`` precedes the epoch (the time is packed
            as an unsigned 64-bit integer).

    Sample usage (output depends on the current time):
        >>> generate_uuid(datetime.utcnow(), "555-1212", "555-6623")  # doctest: +SKIP
        UUID('0000014b-8417-1058-b1b2-7118ad5be2f0')
    """
    digest = mmh3.hash_bytes(';'.join(map(str, vals)))
    elapsed = timestamp - datetime.utcfromtimestamp(0)
    # int() replaces the Python-2-only long(); the whole-second truncation is
    # kept on purpose so previously generated UUIDs remain reproducible.
    ts = int(elapsed.total_seconds()) * 1000
    return uuid.UUID(bytes=struct.pack('>Q', ts) + digest[:8])
def setOriginalDirectory(str):
    """Store ``str`` in the module-level ORIGINAL_DIRECTORY.

    Args:
        str: Path of the original directory (the name is kept for keyword
            callers).
    """
    global ORIGINAL_DIRECTORY
    # Rebinding a global cannot raise, so the former try/except (whose
    # handler terminated the process) was unreachable and has been removed.
    ORIGINAL_DIRECTORY = str
def setFilePathDirectory(str):
    """Store ``str`` in the module-level FILEPATH_DIRECTORY.

    Args:
        str: Path of the working directory (the name is kept for keyword
            callers).
    """
    global FILEPATH_DIRECTORY
    # Rebinding a global cannot raise, so the former try/except and its
    # hard-coded fallback path could never run and have been removed.
    FILEPATH_DIRECTORY = str
def setGUI(gui, root):
    """Store the GUI and root references in the module-level GUI and ROOT.

    Args:
        gui: Object whose ``printToConsole`` method printToConsole() calls.
        root: Object whose ``update`` method printToConsole() calls.
    """
    global GUI, ROOT
    # Rebinding globals cannot raise, so the former try/except (whose handler
    # terminated the process) was unreachable and has been removed.
    GUI = gui
    ROOT = root
def _ensureDirectory(path):
    """Create ``path`` (and missing parents) unless it is already a directory."""
    try:
        os.makedirs(path)
    except OSError:
        # makedirs raises when the leaf already exists; only a genuine
        # failure (for example a permission error) is propagated.
        if not os.path.isdir(path):
            raise


def createFilePathDirectories(optionSaveImages=False, optionSaveSessions=False):
    """Create the OutputData sub-directories under FILEPATH_DIRECTORY.

    The working directory is changed to FILEPATH_DIRECTORY. When
    ``optionSaveImages`` is true it ends up in ``OutputData`` instead,
    exactly as before.

    Args:
        optionSaveImages: Create ``OutputData/PDFjpegs`` when true.
        optionSaveSessions: Create ``OutputData/IndividualData`` when true.
    """
    try:
        os.chdir(FILEPATH_DIRECTORY)
        if optionSaveSessions:
            _ensureDirectory(os.path.join("OutputData", "IndividualData"))
        if optionSaveImages:
            _ensureDirectory(os.path.join("OutputData", "PDFjpegs"))
            os.chdir("OutputData")
    except Exception:
        # Best effort: report the problem and let the caller continue.
        print("Error creating file directories, possibility of fatal error")
        printToConsole("////Error creating file directories, possibility of fatal error////")
def changeDirectory(str=None):
    """Change to FILEPATH_DIRECTORY, then to ``str`` relative to it.

    The default used to be ``FILEPATH_DIRECTORY`` itself, but a default is
    evaluated once at definition time, when that global is still the
    placeholder 0. A call without arguments therefore always took the error
    path. ``None`` now means "stay in FILEPATH_DIRECTORY".

    Args:
        str: Target directory, absolute or relative to FILEPATH_DIRECTORY
            (the name is kept for keyword callers).

    Raises:
        OSError, TypeError: If FILEPATH_DIRECTORY itself cannot be entered.
    """
    os.chdir(FILEPATH_DIRECTORY)
    if str is None:
        return
    try:
        os.chdir(str)
    except (OSError, TypeError):
        # Fall back to the base directory so later relative paths still work.
        os.chdir(FILEPATH_DIRECTORY)
        print("Error changing Directory")
        printToConsole("////Error Changing Directory////")
def sendToServer(payload, optionConnectToServer=False):
    """POST ``payload`` to the sessions API as a one-element JSON array.

    Args:
        payload: JSON-serialisable session data.
        optionConnectToServer: When false nothing is sent and a notice is
            printed instead.

    Failures are reported on stdout and the GUI console; nothing is raised
    for request or serialisation errors.
    """
    if not optionConnectToServer:
        print("Not Connecting to Server")
        printToConsole("////Not Connecting to Server////")
        return

    # NOTE(review): the URL is hard-coded; the module-level API_ENDPOINT is
    # not consulted here. Confirm which one is authoritative.
    link = "http://10.0.2.2:9092/CollectionExplorer/Collections/sessionSolr/api/sessions/"
    batch = [payload]
    try:
        print(json.dumps(batch))
        # Without a timeout requests can block indefinitely on an
        # unresponsive server.
        r = requests.post(link, json=batch, timeout=30)
        print(r.status_code)
        printToConsole("Server Response Code: " + str(r.status_code) + " ")
    except (requests.exceptions.RequestException, TypeError, ValueError):
        # TypeError/ValueError cover a payload that json cannot serialise.
        print("Error connecting to server")
        printToConsole("////Error Connection to Server////")
def createFullTextFile():
    """Create (or empty) ``OutputData/Fulltext.txt``.

    Returns:
        The already-closed file object on success, or 0 on failure. Callers
        such as writeToFiles test for both an int and a closed handle.
    """
    try:
        changeDirectory("./OutputData")
        # Opening in "w" mode creates or truncates the file. The original
        # opened it a second time in "a" mode and leaked the first handle.
        with open("Fulltext.txt", "w") as fullTextFile:
            pass
        return fullTextFile
    except (IOError, OSError, TypeError):
        print("Error creating full text, possibility of fatal error")
        printToConsole("////Error creating full text, possibility of fatal error////")
        return 0
def createInstanceTextFile(filename):
    """Create (or empty) ``OutputData/IndividualData/<filename>.txt``.

    Args:
        filename: Base name; ".txt" is appended.

    Returns:
        The already-closed file object on success, or 0 on failure.
    """
    try:
        changeDirectory("./OutputData/IndividualData")
        # The with-block guarantees the handle is closed even if an error
        # occurs between opening and closing.
        with open(filename + ".txt", "w") as invTextFile:
            pass
        return invTextFile
    except (IOError, OSError, TypeError):
        print("Error creating instance text, possibility of fatal error")
        printToConsole("////Error creating instance text, possibility of fatal error////")
        return 0
def openFullTextFile():
    """Open ``OutputData/Fulltext.txt`` for appending.

    Returns:
        The open file object, or 0 when it cannot be opened. The original
        fell through to ``return fullTextFile`` with the name unbound, which
        raised UnboundLocalError. 0 matches createFullTextFile and the
        ``isinstance(..., int)`` checks in writeToFiles and writeSessionData.
    """
    try:
        changeDirectory("./OutputData")
        return open("Fulltext.txt", "a")
    except (IOError, OSError, TypeError):
        print("Error opening full text file, possibility of fatal error")
        printToConsole("////Error opening full text file, possibility of fatal error////")
        return 0
def openInvTextFile(filename):
    """Open ``OutputData/IndividualData/<filename>`` for writing.

    Unlike createInstanceTextFile, no ".txt" suffix is appended here.

    Args:
        filename: Name of the file, used as given.

    Returns:
        The open file object, or 0 when it cannot be opened. The original
        returned the undefined name ``instanceTextFile`` on failure, which
        raised NameError. 0 matches createInstanceTextFile and the
        ``isinstance(..., int)`` checks in writeToFiles and writeSessionData.
    """
    try:
        changeDirectory("./OutputData/IndividualData")
        return open(filename, "w")
    except (IOError, OSError, TypeError):
        print("Error opening instance text file, possibility of fatal error")
        printToConsole("////Error opening instance text file, possibility of fatal error////")
        return 0
def _writeLineIfOpen(handle, line):
    """Write ``line`` plus a newline to ``handle`` if it is an open file."""
    # None means "no file"; an int (0) is the failure marker returned by the
    # create*/open* helpers in this module.
    if handle is None or isinstance(handle, int):
        return
    if not handle.closed:
        handle.write(line + "\n")


def writeToFiles(str, fullTextFile=None, invTextFile=None):
    """Echo one line to stdout and the GUI console, and append it to files.

    Args:
        str: The line to emit (the name is kept for keyword callers). It is
            encoded to UTF-8; if that fails it is decoded from UTF-8 and
            re-encoded as ASCII with unencodable characters dropped.
        fullTextFile: Optional file for the combined output. None, an int or
            a closed file is skipped.
        invTextFile: Optional per-document file, skipped the same way.

    NOTE: the encode/concatenate sequence relies on Python 2 byte strings.
    """
    text = str
    try:
        text = text.encode("utf-8")
    except Exception:
        print("Error encoding string to utf-8, attempting to encode in 'ascii'")
        printToConsole("////Error encoding string to utf-8, attempting to encode in 'ascii'////")
        try:
            text = text.decode("utf-8")
            text = text.encode("ascii", "ignore")
        except Exception:
            print("Error encoding string to ascii, possibility of fatal error")
            printToConsole("////Error encoding string to ascii, possibility of fatal error////")
    print(text)
    printToConsole(text)
    try:
        _writeLineIfOpen(fullTextFile, text)
        _writeLineIfOpen(invTextFile, text)
    except Exception:
        # Best effort: a failed write must not abort the calling workflow.
        print("Error writing to files")
        printToConsole("////Error writing to files////")
def createContextDictionary(userDict):
    """Write the words in ``userDict`` to ``LangDictionaries/ContextDict.txt``.

    The working directory is changed to ORIGINAL_DIRECTORY and stays there.
    Any existing file is overwritten, one UTF-8 encoded word per line.

    Args:
        userDict: Iterable of words; when a dict is passed, its keys are
            written.
    """
    try:
        os.chdir(ORIGINAL_DIRECTORY)
        dictPath = os.path.join("LangDictionaries", "ContextDict.txt")
        # The with-block closes the file even when a write fails. Addressing
        # the file by path avoids being left inside LangDictionaries.
        with open(dictPath, "w") as contextDictFile:
            for word in userDict:
                contextDictFile.write(word.encode("utf-8") + "\n")
        printToConsole("////Context Dictionary Successfully Created////")
    except Exception:
        print("Error creating context dictionary")
        printToConsole("////Error creating context dictionary////")
def buildUserDict(instanceText):
    """Build the context dictionary from recognised text and save it.

    Each line is lower-cased, stripped of punctuation and split into words.
    A non-numeric word goes into the dictionary when ``wordExists`` accepts
    it, or once it has been seen three times among the unknown words. The
    result is passed to createContextDictionary.

    Args:
        instanceText: Iterable of text lines.
    """
    seenCounts = {}
    userDict = {}
    try:
        printToConsole("////Building Context Dictionary////")
        for item in instanceText:
            print(item)
            for word in replacePunctuations(item.lower()).split():
                if word.isdigit():
                    continue
                # The original also called correction(word) here and threw
                # the result away; nothing in this function read it.
                if wordExists(word):
                    userDict[word] = 1
                    continue
                seenCounts[word] = seenCounts.get(word, 0) + 1
                if seenCounts[word] >= 3:
                    userDict[word] = 1
        createContextDictionary(userDict)
    except Exception:
        print("Error building context dictionary")
        printToConsole("////Error building context dictionary////")
def _sessionText(value):
    """Return ``value`` unchanged if it is text-like, else its str() form."""
    # Booleans, numbers and other non-text values cannot be concatenated to
    # a string directly.
    if hasattr(value, "encode"):
        return value
    return str(value)


def _writeSessionList(indent, key, values, fullTextFile, invTextFile):
    """Emit ``key`` and its list of ``values`` in the '[{ ... }],' layout."""
    writeToFiles(indent + "'" + key + "' : [{", fullTextFile, invTextFile)
    for entry in values:
        writeToFiles(indent + "\t'" + _sessionText(entry) + "',", fullTextFile, invTextFile)
    writeToFiles(indent + "}],", fullTextFile, invTextFile)


def writeSessionData(sessionData, fullTextFile, invTextFile, optionSaveSessions=False):
    """Dump ``sessionData`` line by line through writeToFiles.

    Args:
        sessionData: Dict of session fields. Values may be text, other
            scalars, lists, or dicts whose values are text, scalars or lists.
        fullTextFile: Handle for the combined file, replaced by
            openFullTextFile() when ``optionSaveSessions`` is true.
        invTextFile: Handle for the per-document file, replaced by
            openInvTextFile() when ``optionSaveSessions`` is true and
            ``sessionData["products"]["filename"]`` is present.
        optionSaveSessions: Whether to (re)open the output files.

    Both handles are closed at the end if they are real file objects.
    Errors are reported, not raised.
    """
    try:
        if optionSaveSessions:
            fullTextFile = openFullTextFile()
            # createSessionData currently emits no "products" entry; keep
            # the caller's handle instead of failing with KeyError.
            products = sessionData.get("products")
            if isinstance(products, dict) and "filename" in products:
                invTextFile = openInvTextFile(products["filename"])

        writeToFiles("{", fullTextFile, invTextFile)
        for item in sessionData:
            value = sessionData[item]
            if isinstance(value, dict):
                writeToFiles("\t'" + item + "' : [{", fullTextFile, invTextFile)
                for smallerItem in value:
                    inner = value[smallerItem]
                    if isinstance(inner, list):
                        _writeSessionList("\t\t", smallerItem, inner, fullTextFile, invTextFile)
                    else:
                        writeToFiles("\t\t'" + smallerItem + "' : '" + _sessionText(inner) + "',",
                                     fullTextFile, invTextFile)
                writeToFiles("\t}],", fullTextFile, invTextFile)
            elif isinstance(value, list):
                # Top-level lists (for example "fulltext") used to hit the
                # scalar branch and fail on str + list.
                _writeSessionList("\t", item, value, fullTextFile, invTextFile)
            else:
                writeToFiles("\t'" + item + "' : '" + _sessionText(value) + "',",
                             fullTextFile, invTextFile)
        writeToFiles("},", fullTextFile, invTextFile)

        for handle in (fullTextFile, invTextFile):
            # 0 is the failure marker of the open* helpers; None means the
            # caller supplied no file.
            if handle is not None and not isinstance(handle, int):
                handle.close()
        printToConsole("////Session Data Successfully Written////")
    except Exception:
        print("Error write Session Data to files")
        printToConsole("////Error write Session Data to files////")
def createSessionData(sessionguid, sessionIndex, filename, instanceText, language):
    """Assemble the session dictionary for one processed document.

    Args:
        sessionguid: Object with a ``urn`` attribute; the first 9 characters
            of the URN are dropped.
        sessionIndex: Currently unused.
        filename: Currently unused.
        instanceText: Iterable of text lines, copied into ``"fulltext"``.
        language: Currently unused.

    Returns:
        The session dict. If an error occurs it holds whatever fields were
        set before the failure.
    """
    sessionData = {}
    try:
        sessionData["sessionguid"] = sessionguid.urn[9:]
        sessionData["starttime"] = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S") + "+00:00"
        sessionData["casename"] = "@MultiContentPreviewTestCase"
        sessionData["targetname"] = "@MultiContentPreviewTestTarget"
        sessionData["linename"] = "@MultiContentPreviewTestLine"
        sessionData["sourcesystemname"] = "Sessions API"
        sessionData["sessiontype"] = "Telephony"
        sessionData["isevidence"] = False
        # NOTE: the "languages" and "products" (filename / contenttype /
        # data) entries are intentionally not emitted at the moment.
        sessionData["fulltext"] = list(instanceText)
        printToConsole("////Session Data Successfully Created////")
    except Exception:
        print("Error creating session data")
        printToConsole("////Error creating session data////")
    return sessionData
def spellCheck(instanceText, language="eng", optionSpellCheck=False, optionContextualSpellCheck=False):
    """Spell-check every line of ``instanceText`` in place.

    When either option is enabled, each line first has its punctuation
    replaced by spaces. Every whitespace-delimited token that is not purely
    digits and has at least two characters is then replaced by its
    (possibly corrected) lower-cased form in title case. Tokens are rewritten
    where they stand, so the surrounding whitespace is preserved and a token
    is never altered through a substring match elsewhere in the line. The
    former position-dependent ``str.replace`` calls could do that when words
    were not separated by single spaces.

    Args:
        instanceText: Mutable list of text lines; modified in place.
        language: Dictionary name passed to ``wordExists``/``correction``.
        optionSpellCheck: Use the ``language`` dictionary.
        optionContextualSpellCheck: Use the contextual ("con") dictionary,
            which takes precedence when proposing a correction.

    Returns:
        The same ``instanceText`` list, also when an error occurs.
    """
    import re

    tokenPattern = re.compile(r"\S+", re.UNICODE)

    def _fixToken(match):
        origWord = match.group(0)
        word = origWord.lower()
        if word.isdigit() or len(word) < 2:
            return origWord
        corrWord = word
        known = ((optionSpellCheck and wordExists(word, language))
                 or (optionContextualSpellCheck and wordExists(word, "con")))
        if not known:
            if optionContextualSpellCheck:
                corrWord = correction(word, "con")
            # Fall back to the language dictionary only when the contextual
            # pass left the word unchanged.
            if optionSpellCheck and corrWord == word:
                corrWord = correction(word, language)
        return corrWord.title()

    try:
        if optionSpellCheck or optionContextualSpellCheck:
            for x, item in enumerate(instanceText):
                cleaned = replacePunctuations(item)
                # Store the cleaned line first so it survives a failure in
                # the correction step below.
                instanceText[x] = cleaned
                instanceText[x] = tokenPattern.sub(_fixToken, cleaned)
        printToConsole("////Session Data Successfully Spell Checked////")
    except Exception:
        print("Error applying spell check on instance text")
        printToConsole("////Error applying spell check on instance text////")
    return instanceText
def printToConsole(str):
    """Show ``str`` on the GUI console and refresh the root widget.

    Args:
        str: The message to display (the name is kept for keyword callers).

    Any failure, for example when GUI or ROOT have not been set yet, is
    reported on stdout and otherwise ignored.
    """
    try:
        GUI.printToConsole(str)
        ROOT.update()
    except Exception:
        # `except Exception` rather than a bare except, so KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        print("Error printing to Console")
def checkPDFValidity(str):
    """Report whether ``<str>.pdf`` in FILEPATH_DIRECTORY is a readable PDF.

    Args:
        str: File name without the ".pdf" extension (the name is kept for
            keyword callers).

    Returns:
        True when the reader reports at least one page. False otherwise,
        including when the file cannot be opened or parsed. Those cases used
        to return None implicitly; False is equally falsy.
    """
    try:
        changeDirectory(FILEPATH_DIRECTORY)
        # open() replaces the Python-2-only file(), and the with-block
        # closes the handle, which was previously leaked.
        with open(str + ".pdf", "rb") as pdfFile:
            return PdfFileReader(pdfFile).numPages > 0
    except AssertionError:
        print("File is not a valid pdf (AssertionError)")
        printToConsole("////File is not a valid pdf////")
    except IOError:
        print("File is not a valid pdf (IOError)")
        printToConsole("////File is not a valid pdf////")
    except Exception:
        print("Error checking validity of file")
        printToConsole("////Error checking validity of file////")
    return False
def setAPIconfig(apiEP, credUsername, credPassword):
    """Store the API endpoint and credentials in the module-level globals.

    Args:
        apiEP: Value stored in API_ENDPOINT.
        credUsername: Value stored in CRED_USERNAME.
        credPassword: Value stored in CRED_PASSWORD.
    """
    global API_ENDPOINT, CRED_USERNAME, CRED_PASSWORD
    API_ENDPOINT, CRED_USERNAME, CRED_PASSWORD = apiEP, credUsername, credPassword
def getFullTextSessionData(fullTextFile):
    """Read ``OutputData/Fulltext.txt``, print it, and return its first character.

    Args:
        fullTextFile: Ignored; the file is always reopened by name.

    Returns:
        The first character of the file, or "" when the file is empty
        (previously an IndexError).
    """
    changeDirectory("./OutputData")
    # The with-block closes the handle, which was previously left open.
    with open('Fulltext.txt', 'r') as handle:
        sessionData = handle.read()
    print(sessionData)
    # NOTE(review): only the first character of the raw text is returned.
    # Presumably the text was meant to be parsed first and the first session
    # returned. Confirm the intended behaviour with the callers.
    return sessionData[:1]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement