SHARE
TWEET

Untitled

a guest May 25th, 2019 79 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # coding: utf-8
  2.  
  3. import sys
  4. import random
  5. import operator
  6. import re
  7. # you may add imports if needed (and if they are installed)
  8.  
  9. def parse_url(url):
  10.     stripped = url[7:len(url):1]
  11.     params = {}
  12.     single_params = set()
  13.     need, _, _ = stripped.partition("#")
  14.     raw_segments, _, query = need.partition("?")
  15.     if query != "":
  16.         raws = query.split('&')
  17.         for raw in raws:
  18.             name, templ, value = raw.partition("=")
  19.             if templ != "":
  20.                 params[name] = value
  21.             else:
  22.                 single_params.add(name)
  23.     segments = raw_segments.split('/')
  24.     segments.pop(0)
  25.     return segments, params, single_params
  26.  
  27. def increase(result, key):
  28.     if key in result:
  29.         result[key] += 1
  30.     else:
  31.         result[key] = 1
  32.  
  33. def test_parsed(parsed, result):
  34.     segments, params, single_params = parsed
  35.     length = len(segments)
  36.     if segments[-1] == "":
  37.         length -= 1
  38.     increase(result, "segments:{}".format(length))
  39.     for single in single_params:
  40.         increase(result, "param_name:{}".format(single))
  41.     for key, value in params.items():
  42.         increase(result, "param:{}={}".format(key, value))
  43.         increase(result, "param_name:{}".format(key))
  44.     for ind, segment in enumerate(segments):
  45.         if segment != "":
  46.             increase(result, "segment_name_{}:{}".format(ind, segment))
  47.             if segment.isdigit():
  48.                 increase(result, "segment_[0-9]_{}:1".format(ind))
  49.             increase(result, "segment_len_{}:{}".format(ind, len(segment)))
  50.             prefix, templ, ext = segment.rpartition(".")
  51.             if templ != "" and ext != "" and ext.isalnum():
  52.                 increase(result, "segment_ext_{}:{}".format(ind, ext))
  53.                 if re.match('\D+\d+\D+$', prefix):
  54.                     increase(result, "segment_ext_substr[0-9]_{}:{}".format(ind, ext))
  55.             if re.match('\D+\d+\D+', segment):
  56.                 increase(result, "segment_substr[0-9]_{}:1".format(ind))
  57.  
  58. def extract_features(INPUT_FILE_1, INPUT_FILE_2, OUTPUT_FILE):
  59.     examined = open(INPUT_FILE_1, 'r')
  60.     general = open(INPUT_FILE_2, 'r')
  61.     sample1 = random.sample([url for url in general], 1000)
  62.     sample2 = random.sample([url for url in examined], 1000)
  63.     examined.close()
  64.     general.close()
  65.     sample = sample1 + sample2
  66.     result = {}
  67.     for url in sample:
  68.         good_url = url
  69.         if url[-1] == '\n':
  70.             good_url = url[0:len(url) - 1]
  71.         parsed = parse_url(good_url)
  72.         test_parsed(parsed, result)
  73.     writer = open(OUTPUT_FILE, 'w')
  74.     answer = filter(lambda (key, value): value >= 100, result.items())
  75.     for key, number in sorted(answer, key=operator.itemgetter(1), reverse=True):
  76.         writer.write("{}\t{}\n".format(key, number))
  77.     writer.close()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top