Advertisement
Guest User

url_cleanup.py

a guest
Dec 10th, 2018
232
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.21 KB | None | 0 0
  1. import re
  2. import os
  3.  
  4. # This greps all the external urls in the posts.
  5. HTTP_REGEX = re.compile(r'https?:[^<>"%\\]*?\.(?:png|jpg|bmp|jpeg|gif|mp3|mp4|swf)')
  6.  
  7. if __name__ == "__main__":
  8. blog_names = next(os.walk('./'))[1]
  9. for blog in blog_names:
  10. os.chdir(blog)
  11. media_files = os.listdir('./media')
  12. archive_files = os.listdir('./archive')
  13. posts_files = os.listdir('./posts')
  14.  
  15. for archive_file in archive_files:
  16. html_file = open('./archive/' + archive_file, 'r')
  17. html_str = html_file.read()
  18. html_file.close()
  19. url_list = HTTP_REGEX.findall(html_str)
  20.  
  21. matched_url_list = []
  22. for url in url_list:
  23. filename = os.path.basename(url)
  24. if filename in media_files:
  25. matched_url_list.append(url)
  26. html_str = html_str.replace(url, '../media/' + filename)
  27. new_html_file = open('./archive/' + archive_file, 'w')
  28. new_html_file.write(html_str)
  29.  
  30. for posts_file in posts_files:
  31. html_file = open('./posts/' + posts_file, 'r')
  32. html_str = html_file.read()
  33. html_file.close()
  34. url_list = HTTP_REGEX.findall(html_str)
  35.  
  36. matched_url_list = []
  37. for url in url_list:
  38. filename = os.path.basename(url)
  39. if filename in media_files:
  40. matched_url_list.append(url)
  41. html_str = html_str.replace(url, '../media/' + filename)
  42. new_html_file = open('./posts/' + posts_file, 'w')
  43. new_html_file.write(html_str)
  44.  
  45.  
  46. html_file = open('./index.html', 'r')
  47. html_str = html_file.read()
  48. html_file.close()
  49. url_list = HTTP_REGEX.findall(html_str)
  50.  
  51. matched_url_list = []
  52. for url in url_list:
  53. filename = os.path.basename(url)
  54. if filename in media_files:
  55. matched_url_list.append(url)
  56. html_str = html_str.replace(url, './media/' + filename)
  57. new_html_file = open('./index.html', 'w')
  58. new_html_file.write(html_str)
  59.  
  60. os.chdir('../')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement