Advertisement
Guest User

Look ma, no exceptions

a guest
Jan 28th, 2019
152
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.15 KB | None | 0 0
  1. # Compare sitemaps by turning <url>s flat.
  2. # Dumb text manipulation.
  3. #
  4. # Since this is a throwaway script, I indulge in some crude FP stuff here.
  5. # and_then = fmap; boolean short-circuiting  `and` is the sequence operation.
  6.  
  7. import collections
  8. import os
  9. import re
  10.  
  11. from lxml import etree
  12.  
  13.  
  14. class Right(object):
  15.     __slots__ = ['value']
  16.     def __init__(self, value):
  17.         self.value = value
  18.  
  19.     def __len__(self): return 1  # bool(self) -> True
  20.  
  21.     def bind(self, func): return func(self.value)
  22.  
  23.     def and_then(self, func): return Right(func(self.value))
  24.  
  25.     def and_return(self, value): return Right(value)
  26.  
  27.     def __repr__(self): return 'Right(%s)' % repr(self.value)
  28.  
  29.  
  30. class Left(object):
  31.     __slots__ = ['value']
  32.     def __init__(self, value):
  33.         self.value = value
  34.  
  35.     def __len__(self): return 0  # bool(self) -> False
  36.  
  37.     def and_then(self, *args): return self
  38.  
  39.     and_return = bind = and_then = lambda self, *ignored_args: self
  40.  
  41.     def __repr__(self): return 'Left(%s)' % repr(self.value)
  42.  
  43.  
  44. def from_equality(value, expected):
  45.     if value == expected:
  46.         return Right(value)
  47.     return  Left((value, expected))
  48.  
  49.  
  50. def from_value(value):
  51.     return (Left if value is None else Right)(value)
  52.  
  53.  
  54. def read_expecting(input_stream, prefix):
  55.     line = input_stream.readline().strip()
  56.     return (Right if line.startswith(prefix) else Left)(line)
  57.  
  58.  
  59. def copy_expecting(input_stream, output_stream, prefix):
  60.     return read_expecting(input_stream, prefix).and_then(output_stream.write)
  61.  
  62.  
  63. def flatten_url(input_stream, output_stream):
  64.     read = lambda prefix: read_expecting(input_stream, prefix)
  65.     copy = lambda prefix: copy_expecting(input_stream, output_stream, prefix)
  66.     return (read('<url>') and
  67.             copy('<loc>') and
  68.             copy('<lastmod>') and
  69.             copy('<changefreq>') and
  70.             copy('<priority>') and
  71.             read('</url>')).and_return('\n').and_then(output_stream.write)
  72.  
  73.  
  74. def flatten_url_without_lastmod(input_stream, output_stream):
  75.     read = lambda prefix: read_expecting(input_stream, prefix)
  76.     copy = lambda prefix: copy_expecting(input_stream, output_stream, prefix)
  77.     return (read('<url>') and
  78.             copy('<loc>') and
  79.             read('<lastmod>') and  # Skip lastmod.
  80.             copy('<changefreq>') and
  81.             copy('<priority>') and
  82.             read('</url>').and_return('\n').and_then(output_stream.write))
  83.  
  84.  
  85. def read_eof(input_stream):
  86.     return from_equality(input_stream.readline(), '')  # No \n -> EOF.
  87.  
  88.  
  89. def repeat_while_right(func, *args):
  90.     while True:  # A tailrec version would be too hard in Python.
  91.         result = func(*args)
  92.         if not result:
  93.             return Right(result.value)  # result is Left here, but we want success.
  94.  
  95.  
  96. def flatten_file(input_stream, output_stream, url_flattener):
  97.     read = lambda prefix: read_expecting(input_stream, prefix)
  98.     got_urlset_end = lambda line: from_equality(line, '</urlset>')
  99.     return (
  100.         read("<?xml") and
  101.         read('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">') and
  102.         repeat_while_right(url_flattener, input_stream, output_stream).bind(
  103.             got_urlset_end
  104.         ) and
  105.         read_eof(input_stream).and_return('Completed successfully')
  106.     )
  107.  
  108.  
  109. def run_file_lastmod(in_name, out_name):
  110.     with open(in_name) as input_stream, open(out_name, 'w') as output_stream:
  111.         return flatten_file(input_stream, output_stream, flatten_url)
  112.  
  113.  
  114. def run_file_no_lastmod(in_name, out_name):
  115.     with open(in_name) as input_stream, open(out_name, 'w') as output_stream:
  116.         return flatten_file(input_stream, output_stream, flatten_url_without_lastmod)
  117.  
  118.  
# Prefix-to-URI map handed to findall() in run_sitemap; 'NS' is the local
# alias used in the 'NS:sitemap/NS:loc' path for the sitemap 0.9 namespace.
XMLNS = {'NS': 'http://www.sitemaps.org/schemas/sitemap/0.9'}  # Per Google spec.
  120.  
  121.  
  122. def run_sitemap(sitemap_path, out_name, use_lastmod=True):
  123.     root = etree.parse(os.path.join(sitemap_path, 'sitemap.xml')).getroot()
  124.     result = Right('Just started')
  125.     flattener = flatten_url if use_lastmod else flatten_url_without_lastmod
  126.     with open(out_name, 'w') as output_stream:
  127.         for loc in root.findall('NS:sitemap/NS:loc', XMLNS):
  128.             part_name = os.path.join(sitemap_path, os.path.basename(loc.text))
  129.             print ('%20s\r' % part_name),  # We're not pure anyway.
  130.             with open(part_name) as input_stream:
  131.                 result = result and flatten_file(input_stream, output_stream, flattener)
  132.     print
  133.     return result.and_then(lambda _: 'Wrote %s successfully' % out_name)
  134.  
  135.  
  136.  
  137. def extract_diff(diff_fname):
  138.     RX_TARGET = re.compile(r'^([<>]).*/(\d+-[^/]+)/menu/')
  139.     RX_OTHER = re.compile(r'^([<>]) <loc>(.*)(?!/menu/)</loc>')
  140.     result = collections.defaultdict(collections.Counter)
  141.     with open(diff_fname) as input_stream:
  142.         for line in input_stream:
  143.             (from_value(RX_TARGET.search(line)).and_then(
  144.                 lambda match: ('restaurant',) + match.groups()) or
  145.             from_value(RX_OTHER.search(line)).and_then(
  146.                 lambda match: ('other',) + match.groups())).bind(
  147.                     lambda (kind, direction, url): result[direction + kind].update((url,))
  148.                 )
  149.     return result
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement