jmunsch

Remove URLs from text file python

Feb 22nd, 2014
449
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.85 KB | None | 0 0
  1. # remove any instance of a URL (the code below doesn't work)
  2. #
  3. #import re
  4. #
  5. #URL_LIST = []
  6. #infile = open('starter_pack_2','r').readlines()
  7. #outfile = open('output','w+')
  8. #
  9. #get = re.compile('HTTP:\/\/\S*',re.IGNORECASE|re.DOTALL)
  10. #
  11. #for line in infile:
  12. #    try:
  13. #        got = get.search(line)
  14. #        URL = got.group()
  15. #        URL_LIST.append(URL)
  16. #    except Exception,e:print e
  17. #                
  18. #
  19. #
  20. #for line in infile:
  21. #     for URL in URL_LIST:
  22. #         if URL in line:
  23. #             outfile.write(line.replace(URL,""))
  24. #             break
  25. #     else:
  26. #         outfile.write(line)
  27. #         break
  28. #
  29. #outfile.close()
  30. #
  31. #
  32. ############## and after asking in stackoverflow ################
  33. ##
  34. #basically your code should look like:
  35. #
  36. #with open(...) as infile:
  37. #    for line in infile:
  38. #         # do a regex substitution to remove the URL
  39. #@jmunsch pastebin.com/0qgTHvDS
  40. #I did not test it and it's 4am code
  41. #but basically the idea is there
  42. #you parse the file **once**, you substitute every URL given the URL regex, stolen from here: stackoverflow.com/questions/…
  43. #and voila \o/
  44.  
  45. #and do not do infile = open('starter_pack_2','r').readlines() and then for line in infile
  46. #but:
  47. #
  48. #with open(...) as infile:
  49. #    for line in infile:
  50. #
  51. #then you should use context managers, i.e. with open() as
  52. #
  53. #
  54. #well, your url regex is wrong
  55. #you should look up 'url regex' on SO
  56. ##
  57. ## http://stackoverflow.com/users/1290438/zmo
  58. # remove any instance of a url
  59. ##########################################################################
  60.  
  61. import re
  62.  
  63. URL_LIST = []
  64. infile = open('starter_pack_2','r')
  65. outfile = open('output','w+')
  66.  
  67. get = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',re.IGNORECASE|re.DOTALL)
  68.  
  69. for line in infile:
  70.     outfile.write(get.sub('', line))
  71.  
  72. outfile.close()
Advertisement
Add Comment
Please, Sign In to add comment