Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # encoding: utf-8
- """
- hashtaglist.py
- Created by Gordon Bonnar on 2010-03-14.
- Thanks to dlitz for the code replicating cat file|sort|uniq -c|sort -rn
- (NOT WORKING)
- """
- import twitter
- import re
- import sys
- import os.path
# Configuration
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Credentials handed to get_twitter_api_cxn() by main().
username = "XXXXXXXX"
password = "XXXXXXXXX"

# User from which to collect hashtag data.
user = "XXXXXXX"

# Directory holding the log files.
path = "XXXXXXXXXXX/"

# Per-user hashtag log. os.path.join gives the same result as plain
# concatenation when `path` ends with a separator, and still produces a valid
# path when the trailing separator is forgotten.
filename = os.path.join(path, user + "_hashtaglist.txt")
# End of configuration
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
def get_twitter_api_cxn(user, passwd):
    """Create the Twitter API connection object for the given credentials.

    Instantiates ``twitter.Api`` with ``user`` as the username and ``passwd``
    as the password, and returns the resulting object.
    """
    return twitter.Api(username=user, password=passwd)
def print_from_file(file):
    """Print how often each line occurs in ``file``, most frequent first.

    Mirrors ``cat file | sort | uniq -c | sort -rn``. The first line of the
    file is read separately, echoed, and left out of the tally (presumably it
    is the status-ID header line; confirm against the code that writes the
    file). While counting, each line and its running count are echoed as
    well, so the final report is the last block of output.

    Args:
        file: Path of the text file to summarise.

    Raises:
        IOError: If ``file`` cannot be opened for reading.
    """
    # Function-scope import keeps this block independent of the file header.
    from collections import Counter

    filename = file
    print(filename)

    counts = Counter()
    # The context manager guarantees the handle is closed even if reading fails.
    with open(filename, "r") as input_file:
        # First line: echoed but deliberately not counted.
        print(input_file.readline())
        for line in input_file:
            line = line.rstrip("\n").rstrip("\r")  # Strip trailing LF/CRLF
            print(line)
            counts[line] += 1
            print(counts[line])

    # Order by (count, line), descending. A key function works on both
    # Python 2 and 3, unlike the cmp-style comparator.
    ranked = sorted(
        counts.items(),
        key=lambda item: (item[1], item[0]),
        reverse=True,
    )
    for line, count in ranked:
        print("%7d %s" % (count, line))
- #-------
# A hashtag counts only at the very start of the text or right after
# whitespace. The lookbehind avoids consuming the preceding character.
_HASHTAG_RE = re.compile(r"(?<!\S)(#\w+)", re.UNICODE)


def get_hashtaglist(apicxn, user, file, start_id, count):
    """Collect the hashtags from a user's timeline, oldest tweet first.

    Args:
        apicxn: API connection object; only its ``GetUserTimeline`` method is
            used.
        user: User whose timeline is fetched.
        file: Path of the hashtag log. It is written only when no previous
            status ID is known.
        start_id: Last status ID already processed, or None. When None, the
            ID is looked up with ``get_final_status_id(file)``.
        count: Number of statuses to request.

    Returns:
        A list of hashtag strings (including the leading ``#``), in
        chronological order of the statuses they came from.
    """
    filename = file

    # Honour the caller's start_id; fall back to the ID stored in the log.
    if start_id is None:
        start_id = get_final_status_id(filename)
    if start_id is not None:
        # An ID read from the log may still carry its line ending; a blank
        # value means "no ID known".
        start_id = str(start_id).strip() or None

    if start_id is None:
        # No previous ID: fetch the latest statuses and record the newest ID
        # as the first line of a fresh log file.
        print(filename)
        statuses = apicxn.GetUserTimeline(user, count)
        if statuses:  # an empty timeline has no ID to record
            newest_id = statuses[0].id
            print(newest_id)
            with open(filename, "w") as log_file:
                log_file.write(str(newest_id) + "\n")
    else:
        # NOTE(review): the third positional argument is assumed to be the
        # "since" status ID -- confirm against the installed python-twitter
        # version, whose GetUserTimeline signature has changed over releases.
        statuses = apicxn.GetUserTimeline(user, count, start_id)

    # reversed() walks the statuses back to front so hashtags come out
    # oldest first.
    hashtaglist = []
    for status in reversed(statuses):
        hashtaglist.extend(_HASHTAG_RE.findall(status.text))
    return hashtaglist
def get_final_status_id(file):
    """Return the status ID stored on the first line of ``file``.

    Args:
        file: Path of the hashtag log to inspect.

    Returns:
        The first line of the file with surrounding whitespace (including the
        line ending) removed, or None when the file does not exist or its
        first line is blank.
    """
    # Use the path that was passed in rather than a module-level global.
    if not os.path.exists(file):
        return None
    with open(file, "r") as status_file:
        status_id = status_file.readline().strip()
    # A blank first line carries no ID; report it the same way as a missing file.
    return status_id or None
def write_hashtaglist(file, hashtaglist):
    """Overwrite ``file`` with the given hashtags, one per line.

    Args:
        file: Path of the file to write. Any existing content is discarded
            because the file is opened in ``"w"`` mode.
        hashtaglist: Iterable of hashtag strings.

    Raises:
        IOError: If ``file`` cannot be opened for writing.
    """
    # NOTE(review): truncating also drops any first-line status ID already in
    # the file -- confirm that is intended.
    # The context manager closes the handle even if a write fails part-way.
    with open(file, "w") as output_file:
        output_file.writelines(hashtag + "\n" for hashtag in hashtaglist)
def main():
    """Run one collection pass using the module-level configuration.

    Connects to the API, looks up the last recorded status ID, gathers up to
    200 statuses' worth of hashtags, writes them to the log file and prints
    the frequency report.
    """
    connection = get_twitter_api_cxn(username, password)
    last_seen_id = get_final_status_id(filename)
    collected = get_hashtaglist(connection, user, filename, last_seen_id, 200)
    write_hashtaglist(filename, collected)
    print_from_file(filename)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement