Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # remove any instance of a url (the below code doesn't work)
- #
- #import re
- #
- #URL_LIST = []
- #infile = open('starter_pack_2','r').readlines()
- #outfile = open('output','w+')
- #
- #get = re.compile('HTTP:\/\/\S*',re.IGNORECASE|re.DOTALL)
- #
- #for line in infile:
- # try:
- # got = get.search(line)
- # URL = got.group()
- # URL_LIST.append(URL)
- # except Exception,e:print e
- #
- #
- #
- #for line in infile:
- # for URL in URL_LIST:
- # if URL in line:
- # outfile.write(line.replace(URL,""))
- # break
- # else:
- # outfile.write(line)
- # break
- #
- #outfile.close()
- #
- #
- ############## and after asking in stackoverflow ################
- ##
- #basically your code should look like:
- #
- #with open(...) as infile:
- # for line in infile:
- # # do a regex substitution to remove the URL
- #@jmunsch pastebin.com/0qgTHvDS
- #I did not test it and it's 4am code
- #but basically the idea is there
- #you parse the file **once**, you substitute every URL given the URL regex, stolen from here: stackoverflow.com/questions/…
- #and voila \o/
- #and do not do infile = open('starter_pack_2','r').readlines() and then for line in infile
- #but:
- #
- #with open(...) as infile:
- #for line in infile:
- #
- #then you should use context managers, i.e. with open() as
- #
- #
- #well, your URL regex is wrong
- #you should look up 'url regex' on SO
- ##
- ## http://stackoverflow.com/users/1290438/zmo
- # remove any instance of a url
- ##########################################################################
# Remove every http/https URL from the input file and write the cleaned
# text to the output file, one line at a time.
import re

# Files the script reads from and writes to when run directly.
INPUT_PATH = 'starter_pack_2'
OUTPUT_PATH = 'output'

# URL matcher.  A raw string is used so that ``\(`` and ``\)`` reach the regex
# engine as written instead of being treated as (invalid) string escapes.
# IGNORECASE lets the scheme match "HTTP://" / "Https://" as well.
get = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    re.IGNORECASE)


def strip_urls(text):
    """Return *text* with every substring matched by ``get`` removed."""
    return get.sub('', text)


def main(in_path=INPUT_PATH, out_path=OUTPUT_PATH):
    """Copy *in_path* to *out_path*, deleting URLs from each line.

    The input is streamed line by line, so the whole file is never held in
    memory.  Both files are opened through context managers so they are
    closed even if reading or writing raises.
    """
    with open(in_path, 'r') as infile, open(out_path, 'w') as outfile:
        for line in infile:
            outfile.write(strip_urls(line))


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment