Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # TagGen.py
- # Process a list of URLs, get tags for each from a web service and create a bag of words
- # Arguments:
- # 1. file containing URLs to process (1 per line)
- # 2. file to write list to as a comma separated string
- # usage:
- # python TagGen.py urls.txt tagsfile.txt
- import fileinput
- import simplejson as json
- import sys
- import urllib2
# Read in a list of URLs (one on each line) and create a list to process.
# Surrounding whitespace (including the line terminator) is removed and
# blank lines are skipped, so every entry is ready to be sent to the service.
# @param fileName <str> path of the file holding the URLs
# @return <list> of urls
def getURLs(fileName):
    L = []
    reader = fileinput.input(fileName)
    try:
        for line in reader:
            url = line.strip()
            # An empty line is not a URL; do not count or return it
            if url:
                L.append(url)
    finally:
        # Release the module-level fileinput state even if reading fails,
        # otherwise a later fileinput.input() call would be refused
        reader.close()
    print("%s urls to check" % len(L))
    return L
# Query the TagTheNet api and get the topic tags for one URL.
# OpenCalais is also nice, enjoying the simplicity of this.
# Could create different tag processing functions for different APIs and set as an option.
# Every failure is reported on stdout and produces an empty list, so a single
# bad URL never aborts a batch run.
# @param destURL <str> address of the page to tag
# @return <list> of tags (empty when any stage of the lookup fails)
def getTags(destURL):
    # Imported locally: the file's import block only provides urllib2
    from urllib import quote

    # Lines read from a file may still carry their line terminator
    destURL = destURL.strip()
    # Percent-encode the target so its own '&', '?' or '#' characters stay
    # inside the "url" parameter instead of altering the API query
    u = 'http://tagthe.net/api/?view=json&url=%s' % quote(destURL, safe='')
    print("Checking: %s" % destURL)

    try:
        f = urllib2.urlopen(u)
        try:
            jsonString = f.read()
        finally:
            f.close()
    except Exception:
        # Best-effort boundary: any network/HTTP error skips this URL, but
        # KeyboardInterrupt and SystemExit are no longer swallowed
        print("FAIL: URL not loaded/read")
        return []

    try:
        o = json.loads(jsonString)
    except ValueError:
        print("FAIL: json not loaded")
        return []

    try:
        tags = o['memes'][0]['dimensions']['topic']
        summary = ", ".join(tags)
    except (KeyError, IndexError, TypeError):
        print("FAIL: No tags returned from service")
        return []

    try:
        print("Found some tags : %s" % summary)
    except UnicodeError:
        # The console cannot display one of the tags; the tags themselves
        # are still valid, so keep them
        print("Found some tags : %s (not displayable)" % len(tags))
    return tags
# Collect the tags of every URL listed in a file into one flat list.
# Tags equal to "nullItem" are dropped; all others are UTF-8 encoded.
# @param urlFile <str> path of the file holding the URLs
# @return <list> of UTF-8 encoded tags, in the order they were found
def go(urlFile):
    collected = []
    for address in getURLs(urlFile):
        collected.extend(
            unicode(tag).encode("utf-8")
            for tag in getTags(address)
            if tag != "nullItem"
        )
    return collected
# Script entry: needs both the URL list file and the output file.
# sys.argv[2] is read below, so fewer than three entries is a usage error.
if len(sys.argv) < 3:
    sys.stderr.write('Usage: python %s <urls file> <tags file>\n' % sys.argv[0])
    sys.exit(1)
else:
    myTags = go(sys.argv[1])
    outfile = open(sys.argv[2], 'w')
    try:
        outfile.write(", ".join(myTags))
    finally:
        # Close the file even when the write fails
        outfile.close()
    print("%s tags created!" % len(myTags))
Advertisement
Add Comment
Please, Sign In to add comment