Advertisement
Guest User

Untitled

a guest
Jan 17th, 2017
290
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.17 KB | None | 0 0
  1. # #this is not the live version. It contains a dummy password. This is for testing only. This is connected to a cron job
  2.  
  3. #todo
  4. #add ssh
  5.  
  6. import requests, bs4, sys, webbrowser, html2text, os , PyPDF2, urllib2, smtplib, re, json
  7. from email.MIMEMultipart import MIMEMultipart
  8. from email.MIMEText import MIMEText
  9. import mechanize
  10. import cookielib
  11.  
  12.  
  13. #uncomment these 2 lines of code if you get the below error. Some unicode encoding stuff
  14. #UnicodeEncodeError: 'ascii' codec can't encode character u'\ufeff' in position 0: ordinal not in range(128)
  15. reload(sys)
  16. sys.setdefaultencoding('utf8')
  17.  
  18. stubFilename='carIdHashTable.json'
  19. queryStringStubForTucson='http://tucson.craigslist.org/search/cto?'
  20. queryStringForViewMatches='http://www.pof.com/viewmatches.aspx?agelow=23&agehigh=99&miles=10&contacted=2&cmdSearch=Refine+Matches'
  21. firstQueryString='http://www.pof.com/'
  22. numberOfGoogleResults=1000
  23. stubMessage='Hey, nice profile. Must say you have a very nice smile. Are you from Tucson originally?'
  24. startValue=1
  25. stubUrlForPof='http://www.pof.com/'
  26. stubUrlForTucsonCLInnerpages='http://tucson.craigslist.org/'
  27. stubUrlForPhxCLInnerpages='http://phoenix.craigslist.org/'
  28. username=""
  29. #the values can be manual, automatic, or both
  30. transmission="both"
  31. pwd=""
  32. fromaddr="mithunpaul08@gmail.com"
  33. toaddr="mithunpaul08@gmail.com"
  34. #toaddr="jchebet@email.arizona.edu"
  35. subjectForEmail= "Today's details of the used cars in tucson/phoenix area you asked for"
  36. carbonCopy = "mithunpaul08@gmail.com"
  37. #if on laptop dont switch path. This is required because cron runs as a separate process in a separate directory in chung
  38. #turn this to true, if pushing to run on chung.cs.arizona.edu
  39. isRunningOnServer=False;
  40. firstTimeRun=False;
  41.  
  42.  
  43. if(firstTimeRun):
  44. bodyOfEmail="Hi, \n Here is a list of all the cars found today in Craigslist. This is the very first email of craigslist scraping for used cars. Tomorrow onwards you will be shown only new hits that were not sent today. These are the parameters used for this query:\n\n"
  45. else:
  46. bodyOfEmail="Hi,\n So the results you see below are what were newly found today. Everything else is same as what was sent yesterday. \nThese are the parameters used for this query:\n\n"
  47.  
  48.  
  49. path = "/home/mithunpaul/allResearch/clscraper/main/src/"
  50. #pathonLaptop
  51. #path = "/home/mithunpaul/allResearch/clscraper/main/src/"
  52.  
  53. #toget to:email id and my gmail password from command line
  54. if(len(sys.argv)>1):
  55. username=sys.argv[1]
  56. pwd = sys.argv[2]
  57. #print("username:"+username)
  58. # print("pwd:" + pwd)
  59.  
  60. else:
  61. print("not enough arguments in Command Line. Exiting.")
  62. sys.exit(1)
  63.  
  64.  
  65.  
  66. class myCar:
  67. min_price = ""
  68. max_price =""
  69. auto_make_model=""
  70. min_auto_year=""
  71. max_auto_year=""
  72. min_auto_miles=''
  73. max_auto_miles=''
  74. auto_title_status=''
  75. auto_transmission=''
  76.  
  77.  
  78.  
  79. #"Search Query attributes used to build the query string"
  80. def fillSearchQueryAttributes(queryCar):
  81. queryCar.min_price = "1"
  82. queryCar.max_price ="6000"
  83. queryCar.auto_make_model="honda+%7C+toyota"
  84. queryCar.min_auto_year="2005"
  85. queryCar.max_auto_year="2016"
  86. queryCar.min_auto_miles='1'
  87. queryCar.max_auto_miles='110000'
  88. queryCar.auto_title_status='1'
  89. queryCar.auto_transmission='1'
  90.  
  91. def createQueryObject(queryStringStubToBuild, carObject):
  92. queryStringToSearch = str(queryStringStubToBuild)+"sort=priceasc&min_price="+carObject.min_price+\
  93. "&max_price="+carObject.max_price+\
  94. "&auto_make_model="+carObject.auto_make_model+\
  95. "&min_auto_year="+carObject.min_auto_year+\
  96. "&max_auto_year="+carObject.max_auto_year+\
  97. "&min_auto_miles="+carObject.min_auto_miles+\
  98. "&max_auto_miles="+carObject.max_auto_miles+\
  99. "&auto_transmission="+carObject.auto_transmission+\
  100. "&auto_title_status="+carObject.auto_title_status
  101. if (transmission=="both"):
  102. queryStringToSearch=queryStringToSearch+"&auto_transmission="+`2`
  103.  
  104. return queryStringToSearch
  105.  
  106. def sendEmail(listOfMyCars,carObject):
  107. finalMessageToSend=""
  108. if(listOfMyCars.__len__()==0):
  109. finalMessageToSend="hi, no new cars were found in today's search. Have a good day"
  110. else:
  111. queryResultsAsString="\n\n".join(listOfMyCars)
  112. bodyWithQueryDetails=createQueryObject(bodyOfEmail,carObject);
  113. bodyWithQueryDetailsreplacedAmbersand=bodyWithQueryDetails.replace("&", "\n")
  114. finalMessageToSend=bodyWithQueryDetailsreplacedAmbersand+"\n \nAnd the results are as follows:\n\n"+queryResultsAsString
  115. print("getting here at 32423")
  116.  
  117. msg = "\r\n".join([
  118. "From: "+fromaddr,
  119. "To: " + toaddr,
  120. "CC: " + carbonCopy,
  121. "Subject:"+subjectForEmail,
  122. "",
  123. finalMessageToSend
  124. ])
  125.  
  126. #print("getting here at 3687")
  127. server = smtplib.SMTP('smtp.gmail.com:587')
  128. server.ehlo()
  129. #print("getting here at 8637")
  130. server.starttls()
  131. #print("getting here at 52895")
  132. server.login(gmailUsername, gmailPwd)
  133. #print("getting here at 5498")
  134. server.sendmail(fromaddr, toaddr, msg)
  135. #print("getting here at 68468")
  136. server.quit()
  137. print("done sending email to:"+toaddr)
  138.  
  139.  
  140.  
  141.  
  142. def buildMessageBody(carObjectToBuildQuery):
  143. bodyOfEmail = "Hi, the details used for this query are as follows:"+carObjectToBuildQuery
  144.  
  145.  
  146.  
  147. def encodeAndwriteToOutputFile(textToWrite):
  148. target = open(stubFilename+'.txt', 'w+')
  149. target.write(html2text.html2text(textToWrite).encode('utf-8'))
  150. target.close()
  151.  
  152.  
  153. def writeToOutputFile(textToWrite):
  154. target = open(stubFilename+'.txt', 'w+')
  155. target.write(textToWrite);
  156. target.close()
  157.  
  158. def AdduidToHashtable(uniqueId, localhtToCheck):
  159. localhtToCheck[uniqueId] = 1
  160. print("length of hashtable inside checkAndadduidToHashtable is:"+`localhtToCheck.__len__()`)
  161. return localhtToCheck
  162.  
  163. def readFromJsonToHashtable(filename):
  164. # load from file:
  165. htMyTable={}
  166. with open(filename, 'r') as f:
  167. try:
  168. #print("inside child :length of hashtable that just came in is:"+`carIdHashTable.__len__()`)
  169. #carIdHashTable["test"] = 1
  170. # print("inside child :length of hashtable that just came in is:"+`carIdHashTable.__len__()`)
  171. htMyTable = json.load(f)
  172. #print("inside child :length of hashtable inside is:"+`htMyTable.__len__()`)
  173. #carIdHashTable=htMyTable
  174. # print("inside child :length of carIdHashTable inside is:"+`carIdHashTable.__len__()`)
  175. # if the file is empty the ValueError will be thrown
  176. except ValueError:
  177. carIdHashTable = {}
  178. return htMyTable
  179.  
  180.  
  181. def writeToFileAsJson(myhashTable, filename):
  182. # save to file:
  183. with open(filename, 'w+') as f:
  184. json.dump(myhashTable, f)
  185. f.close()
  186.  
  187. def parseGResults(myQS):
  # Purpose: open myQS in a mechanize browser, submit the page's first form
  # with the module-level username/pwd, open queryStringForViewMatches, and
  # for every <a> element whose class contains "mi" open the linked profile
  # page (stubUrlForPof + href) and submit stubMessage through that page's
  # first form.
  # Parameter: myQS - URL of the page that carries the login form.
  # NOTE(review): the indentation of this function was lost in this copy, so
  # the nesting of the try / for / if blocks below cannot be confirmed from
  # the text alone - verify against the original file before changing code.
  # NOTE(review): this sends the same canned message to every matching
  # profile - confirm this is permitted by the site before running it.
  188. try:
  189. #code from http://stackoverflow.com/questions/20039643/how-to-scrape-a-website-that-requires-login-first-with-python
  190. # Browser
  191. br = mechanize.Browser()
  192.  
  193. # Cookie Jar
  194. cj = cookielib.LWPCookieJar()
  195. br.set_cookiejar(cj)
  196.  
  197. # Browser options
  198. br.set_handle_equiv(True)
  199. br.set_handle_gzip(True)
  200. br.set_handle_redirect(True)
  201. br.set_handle_referer(True)
  202. br.set_handle_robots(False)
  203. br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
  204.  
  205. br.addheaders = [('User-agent', 'Chrome')]
  206.  
  207. # The site we will navigate into, handling its session
  208. br.open(myQS)
  209.  
  210. # View available forms
  211. # for f in br.forms():
  212. # print f
  213.  
  214. # Select the form at index 0 (nr=0), i.e. the first form on the page
  215. br.select_form(nr=0)
  216.  
  217. # User credentials
  218. br.form['username'] = username
  219. br.form['password'] = pwd
  220.  
  221. # Login
  222. br.submit()
  223.  
  224. try:
  225. #note:queryStringForViewMatches already contains the clause: haven't contacted before. You don't want to spam
  226. #someone you have already contacted and then get blocked
  227. url=br.open(queryStringForViewMatches)
  228. #url = urllib2.urlopen(queryStringToSearch)
  229. except urllib2.HTTPError, e:
  230. print('HTTPError = ' + str(e.code))
  231. except urllib2.URLError, e:
  232. print('URLError = ' + str(e.reason))
  # NOTE(review): httplib is not among the modules imported at the top of
  # this file, so evaluating this except clause would raise NameError - confirm.
  233. except httplib.HTTPException, e:
  234. print('HTTPException')
  235. except Exception:
  236. import traceback
  237. print('generic exception: ' + traceback.format_exc())
  238. else:
  # content is assigned only here, in the no-exception branch; the
  # BeautifulSoup call below reads it.
  239. content = url.read()
  240.  
  241.  
  242. print("succesfully logged into pof")
  243. # parse the content into a format that soup understands
  244. soup = bs4.BeautifulSoup(content, "lxml")
  245. # for each of the hyperlinks in the page
  246. for link in soup.find_all('a'):
  247. #print(link)
  248. classResult = link.get('class')
  249. if (classResult != None):
  250. if ("mi" in classResult):
  251. # if the class exists, get the link, if it's not null
  252. linkToNextPage = link.get('href')
  253. if (linkToNextPage != None):
  254. print("\n")
  255. profilePageUrl = stubUrlForPof + linkToNextPage
  256. #print(profilePageUrl)
  257. # once you get the link to the person's profile, open and go into that page.
  258.  
  259.  
  260. try:
  261. br.open(profilePageUrl)
  262. #for f in br.forms():
  263. #print f
  264.  
  265. # Select the first form (the first form is the quick message form)
  266. br.select_form(nr=0)
  267.  
  268. # Fill in the message text
  269. br.form['message'] = stubMessage
  270.  
  271.  
  272. # submit the text
  273. br.submit()
  274. print("sent message to "+profilePageUrl)
  275.  
  276. except urllib2.HTTPError, e:
  277. print('HTTPError = ' + str(e.code))
  278. except urllib2.URLError, e:
  279. print('URLError = ' + str(e.reason))
  # NOTE(review): same as above - httplib is not imported in the visible imports.
  280. except httplib.HTTPException, e:
  281. print('HTTPException')
  282. except Exception:
  283. import traceback
  284. print('generic exception: ' + traceback.format_exc())
  285. #else:
  286. #profilePageDetails = profilePage.read()
  287.  
  # NOTE(review): sys.exit raises SystemExit. If this statement sits inside
  # the outer try, the bare "except:" below catches it and only prints a
  # traceback. Its nesting level (inside the loop or after it) cannot be
  # determined from this copy - confirm.
  288. sys.exit(1)
  289.  
  290.  
  291.  
  292. except:
  293. #print('generic exception: ')
  294. import traceback
  295. print('generic exception: ' + traceback.format_exc())
  296. #+sys.exc_info()[0])
  297.  
  298.  
  299.  
  300. cwd = os.getcwd()
  301. print("current directory is:"+cwd)
  302. # Now change the directory
  303. if(isRunningOnServer):
  304. os.chdir( path )
  305.  
  306.  
  307. parseGResults(firstQueryString)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement