Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Point downloaded blog pages at their locally saved media files.

The script expects to be run from a directory that contains one
sub-directory per blog.  Inside each blog directory it looks for:

    media/       downloaded media files
    archive/     HTML pages (media is referenced as ``../media/<name>``)
    posts/       HTML pages (media is referenced as ``../media/<name>``)
    index.html   HTML page  (media is referenced as ``./media/<name>``)

Every external media URL whose file name exists in ``media/`` is replaced
by the matching relative path; all other URLs are left untouched.
"""
import os
import re

# Matches every external URL in a page that ends in a known media extension.
HTTP_REGEX = re.compile(r'https?:[^<>"%\\]*?\.(?:png|jpg|bmp|jpeg|gif|mp3|mp4|swf)')


def localize_html(html_str, media_files, media_prefix):
    """Return ``html_str`` with URLs of locally available media rewritten.

    Args:
        html_str: The page contents.
        media_files: Container of file names present in the media directory.
        media_prefix: Relative path prefix put in front of the file name,
            e.g. ``'../media/'``.

    Returns:
        The page contents in which every URL matched by ``HTTP_REGEX`` whose
        base name is in ``media_files`` is replaced by
        ``media_prefix + base name``.
    """
    def to_local(match):
        url = match.group(0)
        filename = os.path.basename(url)
        if filename in media_files:
            return media_prefix + filename
        return url

    # One pass over the document: each match is replaced exactly where it was
    # found, instead of re-scanning the whole page once per URL.
    return HTTP_REGEX.sub(to_local, html_str)


def localize_file(path, media_files, media_prefix):
    """Rewrite media URLs inside the file at ``path`` in place.

    The file is only written when at least one URL was actually replaced.

    Args:
        path: Path of the HTML file to process.
        media_files: Container of file names present in the media directory.
        media_prefix: Relative path prefix for rewritten URLs.

    Returns:
        True if the file was modified, False otherwise.
    """
    # surrogateescape lets bytes that are not valid UTF-8 round-trip
    # unchanged, and newline='' disables newline translation, so nothing but
    # the URLs is altered in the file.
    io_options = {'encoding': 'utf-8', 'errors': 'surrogateescape', 'newline': ''}

    with open(path, 'r', **io_options) as html_file:
        original = html_file.read()

    updated = localize_html(original, media_files, media_prefix)
    if updated == original:
        return False

    with open(path, 'w', **io_options) as html_file:
        html_file.write(updated)
    return True


def localize_blog(blog_dir):
    """Rewrite media URLs in every page of a single blog directory.

    Directories without a ``media`` folder are skipped, as are missing
    ``archive``/``posts`` folders, a missing ``index.html`` and any entry in
    ``archive``/``posts`` that is not a regular file.

    Args:
        blog_dir: Path of the blog directory.

    Returns:
        The number of files that were modified.
    """
    media_dir = os.path.join(blog_dir, 'media')
    if not os.path.isdir(media_dir):
        return 0
    # A set gives constant-time membership tests for every URL found.
    media_files = frozenset(os.listdir(media_dir))

    changed = 0
    for subdir in ('archive', 'posts'):
        page_dir = os.path.join(blog_dir, subdir)
        if not os.path.isdir(page_dir):
            continue
        for name in os.listdir(page_dir):
            page_path = os.path.join(page_dir, name)
            if os.path.isfile(page_path):
                # Pages live one level below the blog root.
                changed += localize_file(page_path, media_files, '../media/')

    index_path = os.path.join(blog_dir, 'index.html')
    if os.path.isfile(index_path):
        # The index sits next to the media directory.
        changed += localize_file(index_path, media_files, './media/')
    return changed


def main(root='./'):
    """Process every blog directory found directly under ``root``.

    Args:
        root: Directory containing one sub-directory per blog.  Defaults to
            the current working directory.

    Returns:
        The total number of files that were modified.
    """
    # Only the immediate sub-directories are blogs; fall back to an empty
    # listing when ``root`` cannot be walked.
    blog_names = next(os.walk(root), (root, [], []))[1]
    return sum(localize_blog(os.path.join(root, blog)) for blog in blog_names)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement