Advertisement
Guest User

transform_content.py

a guest
Sep 19th, 2014
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.63 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # Copyright 2008 Brett Slatkin
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15.  
  16. __author__ = "Brett Slatkin (bslatkin@gmail.com)"
  17.  
  18. import os
  19. import re
  20. import urlparse
  21.  
  22. ################################################################################
  23.  
  24. # URLs that have absolute addresses
  25. ABSOLUTE_URL_REGEX = r"(http(s?):)?//(?P<url>[^\"'> \t\)]+)"
  26.  
  27. # URLs that are relative to the base of the current hostname.
  28. BASE_RELATIVE_URL_REGEX = r"/(?!(/)|(http(s?)://)|(url\())(?P<url>[^\"'> \t\)]*)"
  29.  
  30. # URLs that have '../' or './' to start off their paths.
  31. TRAVERSAL_URL_REGEX = r"(?P<relative>\.(\.)?)/(?!(/)|(http(s?)://)|(url\())(?P<url>[^\"'> \t\)]*)"
  32.  
  33. # URLs that are in the same directory as the requested URL.
  34. SAME_DIR_URL_REGEX = r"(?!(/)|(http(s?)://)|(url\())(?P<url>[^\"'> \t\)]+)"
  35.  
  36. # URL matches the root directory.
  37. ROOT_DIR_URL_REGEX = r"(?!//(?!>))/(?P<url>)(?=[ \t\n]*[\"'\)>/])"
  38.  
  39. # Start of a tag using 'src' or 'href'
  40. TAG_START = r"(?i)\b(?P<tag>src|href|action|url|background)(?P<equals>[\t ]*=[\t ]*)(?P<quote>[\"']?)"
  41.  
  42. # Start of a CSS import
  43. CSS_IMPORT_START = r"(?i)@import(?P<spacing>[\t ]+)(?P<quote>[\"']?)"
  44.  
  45. # CSS url() call
  46. CSS_URL_START = r"(?i)\burl\((?P<quote>[\"']?)"
  47.  
  48. REPLACEMENT_REGEXES = [
  49. (TAG_START + SAME_DIR_URL_REGEX,
  50. "\g<tag>\g<equals>\g<quote>%(accessed_dir)s\g<url>"),
  51.  
  52. (TAG_START + TRAVERSAL_URL_REGEX,
  53. "\g<tag>\g<equals>\g<quote>%(accessed_dir)s/\g<relative>/\g<url>"),
  54.  
  55. (TAG_START + BASE_RELATIVE_URL_REGEX,
  56. "\g<tag>\g<equals>\g<quote>/%(base)s/\g<url>"),
  57.  
  58. (TAG_START + ROOT_DIR_URL_REGEX,
  59. "\g<tag>\g<equals>\g<quote>/%(base)s/"),
  60.  
  61. # Need this because HTML tags could end with '/>', which confuses the
  62. # tag-matching regex above, since that's the end-of-match signal.
  63. (TAG_START + ABSOLUTE_URL_REGEX,
  64. "\g<tag>\g<equals>\g<quote>/\g<url>"),
  65.  
  66. (CSS_IMPORT_START + SAME_DIR_URL_REGEX,
  67. "@import\g<spacing>\g<quote>%(accessed_dir)s\g<url>"),
  68.  
  69. (CSS_IMPORT_START + TRAVERSAL_URL_REGEX,
  70. "@import\g<spacing>\g<quote>%(accessed_dir)s/\g<relative>/\g<url>"),
  71.  
  72. (CSS_IMPORT_START + BASE_RELATIVE_URL_REGEX,
  73. "@import\g<spacing>\g<quote>/%(base)s/\g<url>"),
  74.  
  75. (CSS_IMPORT_START + ABSOLUTE_URL_REGEX,
  76. "@import\g<spacing>\g<quote>/\g<url>"),
  77.  
  78. (CSS_URL_START + SAME_DIR_URL_REGEX,
  79. "url(\g<quote>%(accessed_dir)s\g<url>"),
  80.  
  81. (CSS_URL_START + TRAVERSAL_URL_REGEX,
  82. "url(\g<quote>%(accessed_dir)s/\g<relative>/\g<url>"),
  83.  
  84. (CSS_URL_START + BASE_RELATIVE_URL_REGEX,
  85. "url(\g<quote>/%(base)s/\g<url>"),
  86.  
  87. (CSS_URL_START + ABSOLUTE_URL_REGEX,
  88. "url(\g<quote>/\g<url>"),
  89. ]
  90.  
  91. ################################################################################
  92.  
  93. def TransformContent(base_url, accessed_url, content):
  94. url_obj = urlparse.urlparse(accessed_url)
  95. accessed_dir = os.path.dirname(url_obj.path)
  96. if not accessed_dir.endswith("/"):
  97. accessed_dir += "/"
  98.  
  99. for pattern, replacement in REPLACEMENT_REGEXES:
  100. fixed_replacement = replacement % {
  101. "base": base_url,
  102. "accessed_dir": accessed_dir,
  103. }
  104. content = re.sub(pattern, fixed_replacement, content)
  105. return content
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement