SHARE
TWEET

Look ma, no exceptions

a guest Jan 28th, 2019 92 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # Compare sitemaps by turning <url>s flat.
  2. # Dumb text manipulation.
  3. #
  4. # Since this is a throwaway script, I indulge in some crude FP stuff here.
  5. # and_then = fmap; boolean short-circuiting  `and` is the sequence operation.
  6.  
  7. import collections
  8. import os
  9. import re
  10.  
  11. from lxml import etree
  12.  
  13.  
  14. class Right(object):
  15.     __slots__ = ['value']
  16.     def __init__(self, value):
  17.         self.value = value
  18.  
  19.     def __len__(self): return 1  # bool(self) -> True
  20.  
  21.     def bind(self, func): return func(self.value)
  22.  
  23.     def and_then(self, func): return Right(func(self.value))
  24.  
  25.     def and_return(self, value): return Right(value)
  26.  
  27.     def __repr__(self): return 'Right(%s)' % repr(self.value)
  28.  
  29.  
  30. class Left(object):
  31.     __slots__ = ['value']
  32.     def __init__(self, value):
  33.         self.value = value
  34.  
  35.     def __len__(self): return 0  # bool(self) -> False
  36.  
  37.     def and_then(self, *args): return self
  38.  
  39.     and_return = bind = and_then = lambda self, *ignored_args: self
  40.  
  41.     def __repr__(self): return 'Left(%s)' % repr(self.value)
  42.  
  43.  
  44. def from_equality(value, expected):
  45.     if value == expected:
  46.         return Right(value)
  47.     return  Left((value, expected))
  48.  
  49.  
  50. def from_value(value):
  51.     return (Left if value is None else Right)(value)
  52.  
  53.  
  54. def read_expecting(input_stream, prefix):
  55.     line = input_stream.readline().strip()
  56.     return (Right if line.startswith(prefix) else Left)(line)
  57.  
  58.  
  59. def copy_expecting(input_stream, output_stream, prefix):
  60.     return read_expecting(input_stream, prefix).and_then(output_stream.write)
  61.  
  62.  
  63. def flatten_url(input_stream, output_stream):
  64.     read = lambda prefix: read_expecting(input_stream, prefix)
  65.     copy = lambda prefix: copy_expecting(input_stream, output_stream, prefix)
  66.     return (read('<url>') and
  67.             copy('<loc>') and
  68.             copy('<lastmod>') and
  69.             copy('<changefreq>') and
  70.             copy('<priority>') and
  71.             read('</url>')).and_return('\n').and_then(output_stream.write)
  72.  
  73.  
  74. def flatten_url_without_lastmod(input_stream, output_stream):
  75.     read = lambda prefix: read_expecting(input_stream, prefix)
  76.     copy = lambda prefix: copy_expecting(input_stream, output_stream, prefix)
  77.     return (read('<url>') and
  78.             copy('<loc>') and
  79.             read('<lastmod>') and  # Skip lastmod.
  80.             copy('<changefreq>') and
  81.             copy('<priority>') and
  82.             read('</url>').and_return('\n').and_then(output_stream.write))
  83.  
  84.  
  85. def read_eof(input_stream):
  86.     return from_equality(input_stream.readline(), '')  # No \n -> EOF.
  87.  
  88.  
  89. def repeat_while_right(func, *args):
  90.     while True:  # A tailrec version would be too hard in Python.
  91.         result = func(*args)
  92.         if not result:
  93.             return Right(result.value)  # result is Left here, but we want success.
  94.  
  95.  
  96. def flatten_file(input_stream, output_stream, url_flattener):
  97.     read = lambda prefix: read_expecting(input_stream, prefix)
  98.     got_urlset_end = lambda line: from_equality(line, '</urlset>')
  99.     return (
  100.         read("<?xml") and
  101.         read('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">') and
  102.         repeat_while_right(url_flattener, input_stream, output_stream).bind(
  103.             got_urlset_end
  104.         ) and
  105.         read_eof(input_stream).and_return('Completed successfully')
  106.     )
  107.  
  108.  
  109. def run_file_lastmod(in_name, out_name):
  110.     with open(in_name) as input_stream, open(out_name, 'w') as output_stream:
  111.         return flatten_file(input_stream, output_stream, flatten_url)
  112.  
  113.  
  114. def run_file_no_lastmod(in_name, out_name):
  115.     with open(in_name) as input_stream, open(out_name, 'w') as output_stream:
  116.         return flatten_file(input_stream, output_stream, flatten_url_without_lastmod)
  117.  
  118.  
  119. XMLNS = {'NS': 'http://www.sitemaps.org/schemas/sitemap/0.9'}  # Per Google spec.
  120.  
  121.  
  122. def run_sitemap(sitemap_path, out_name, use_lastmod=True):
  123.     root = etree.parse(os.path.join(sitemap_path, 'sitemap.xml')).getroot()
  124.     result = Right('Just started')
  125.     flattener = flatten_url if use_lastmod else flatten_url_without_lastmod
  126.     with open(out_name, 'w') as output_stream:
  127.         for loc in root.findall('NS:sitemap/NS:loc', XMLNS):
  128.             part_name = os.path.join(sitemap_path, os.path.basename(loc.text))
  129.             print ('%20s\r' % part_name),  # We're not pure anyway.
  130.             with open(part_name) as input_stream:
  131.                 result = result and flatten_file(input_stream, output_stream, flattener)
  132.     print
  133.     return result.and_then(lambda _: 'Wrote %s successfully' % out_name)
  134.  
  135.  
  136.  
  137. def extract_diff(diff_fname):
  138.     RX_TARGET = re.compile(r'^([<>]).*/(\d+-[^/]+)/menu/')
  139.     RX_OTHER = re.compile(r'^([<>]) <loc>(.*)(?!/menu/)</loc>')
  140.     result = collections.defaultdict(collections.Counter)
  141.     with open(diff_fname) as input_stream:
  142.         for line in input_stream:
  143.             (from_value(RX_TARGET.search(line)).and_then(
  144.                 lambda match: ('restaurant',) + match.groups()) or
  145.             from_value(RX_OTHER.search(line)).and_then(
  146.                 lambda match: ('other',) + match.groups())).bind(
  147.                     lambda (kind, direction, url): result[direction + kind].update((url,))
  148.                 )
  149.     return result
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top