Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # coding: utf-8
- import sys
- import random
- import operator
- import re
- # you may add imports if needed (and if they are installed)
def parse_url(url):
    """Split a URL into path segments, valued parameters and bare parameters.

    The scheme prefix (``http://``, ``https://``, ...) and everything from
    the first ``#`` onwards are discarded. The text before the first ``/``
    (the host) is dropped and the rest of the path is split on ``/``.

    Args:
        url: URL string, without a trailing newline.

    Returns:
        A tuple ``(segments, params, single_params)``:
        ``segments`` is the list of path pieces after the host (a trailing
        slash yields a final empty string; a URL without any path yields an
        empty list); ``params`` maps ``name`` to ``value`` for every
        ``name=value`` query item; ``single_params`` is the set of query
        item names that carry no ``=``.
    """
    # Locate the scheme separator instead of blindly cutting seven
    # characters, so "https://..." (or any other scheme) parses correctly.
    scheme, separator, remainder = url.partition("://")
    if separator and not any(ch in scheme for ch in "/?#"):
        stripped = remainder
    else:
        # No leading scheme: "://" is absent or only shows up later,
        # e.g. inside a query value such as "?next=http://...".
        stripped = url

    without_fragment = stripped.partition("#")[0]
    raw_segments, _, query = without_fragment.partition("?")

    params = {}
    single_params = set()
    if query:
        for item in query.split("&"):
            name, equals, value = item.partition("=")
            if equals:
                params[name] = value
            else:
                single_params.add(name)

    # The first piece is the host; only the path pieces are features.
    segments = raw_segments.split("/")[1:]
    return segments, params, single_params
def increase(result, key):
    """Increment the counter stored under ``key`` in ``result``.

    Args:
        result: dict mapping feature names to integer counts; modified in
            place.
        key: feature name to count. A missing key starts at zero.
    """
    result[key] = result.get(key, 0) + 1
def test_parsed(parsed, result):
    """Count the features of one parsed URL into ``result``.

    Args:
        parsed: tuple ``(segments, params, single_params)`` as returned by
            ``parse_url``.
        result: dict of feature name -> occurrence count; updated in place
            through ``increase``.

    Features emitted:
        ``segments:<n>``                      number of path segments
        ``param_name:<name>``                 every query parameter name
        ``param:<name>=<value>``              every valued query parameter
        ``segment_name_<i>:<text>``           literal segment at position i
        ``segment_[0-9]_<i>:1``               segment consists of digits only
        ``segment_len_<i>:<len>``             segment length
        ``segment_ext_<i>:<ext>``             alphanumeric extension after the last dot
        ``segment_ext_substr[0-9]_<i>:<ext>`` extension whose stem is
                                              non-digits, digits, non-digits
        ``segment_substr[0-9]_<i>:1``         segment starts with non-digits,
                                              digits, non-digits
    """
    segments, params, single_params = parsed

    # A trailing slash leaves an empty last segment that is not counted.
    # The emptiness guard matters: a URL without any path produces an empty
    # list, and indexing segments[-1] on it would raise IndexError.
    length = len(segments)
    if segments and segments[-1] == "":
        length -= 1
    increase(result, "segments:{}".format(length))

    for single in single_params:
        increase(result, "param_name:{}".format(single))

    for key, value in params.items():
        increase(result, "param:{}={}".format(key, value))
        increase(result, "param_name:{}".format(key))

    for ind, segment in enumerate(segments):
        # Empty segments (trailing or doubled slashes) carry no features.
        if segment == "":
            continue

        increase(result, "segment_name_{}:{}".format(ind, segment))
        if segment.isdigit():
            increase(result, "segment_[0-9]_{}:1".format(ind))
        increase(result, "segment_len_{}:{}".format(ind, len(segment)))

        # rpartition yields an empty separator when the segment has no dot.
        prefix, dot, ext = segment.rpartition(".")
        if dot != "" and ext != "" and ext.isalnum():
            increase(result, "segment_ext_{}:{}".format(ind, ext))
            if re.match(r'\D+\d+\D+$', prefix):
                increase(result, "segment_ext_substr[0-9]_{}:{}".format(ind, ext))

        # Raw strings keep the pattern text identical while avoiding
        # invalid-escape warnings on newer interpreters.
        if re.match(r'\D+\d+\D+', segment):
            increase(result, "segment_substr[0-9]_{}:1".format(ind))
def extract_features(INPUT_FILE_1, INPUT_FILE_2, OUTPUT_FILE):
    """Sample URLs from two files and write their frequent features.

    Up to 1000 lines are drawn at random from each input file. Every sampled
    URL is parsed with ``parse_url`` and counted with ``test_parsed``.
    Features seen at least 100 times are written to ``OUTPUT_FILE`` as
    ``<feature>\\t<count>`` lines, most frequent first.

    Args:
        INPUT_FILE_1: path to the file of examined URLs, one per line.
        INPUT_FILE_2: path to the file of general URLs, one per line.
        OUTPUT_FILE: path of the feature file to create or overwrite.
    """
    sample_size = 1000
    min_count = 100

    # Context managers close the files even if reading fails.
    with open(INPUT_FILE_1, 'r') as examined:
        examined_urls = list(examined)
    with open(INPUT_FILE_2, 'r') as general:
        general_urls = list(general)

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request for short inputs. The general sample is drawn
    # first, keeping the order of random draws unchanged.
    sample1 = random.sample(general_urls, min(sample_size, len(general_urls)))
    sample2 = random.sample(examined_urls, min(sample_size, len(examined_urls)))

    result = {}
    for url in sample1 + sample2:
        good_url = url[:-1] if url.endswith('\n') else url
        if not good_url:
            # Blank lines carry no URL to analyse.
            continue
        test_parsed(parse_url(good_url), result)

    # A comprehension replaces the tuple-unpacking lambda, which is a
    # syntax error on Python 3.
    answer = [(key, number) for key, number in result.items()
              if number >= min_count]
    answer.sort(key=operator.itemgetter(1), reverse=True)

    with open(OUTPUT_FILE, 'w') as writer:
        for key, number in answer:
            writer.write("{}\t{}\n".format(key, number))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement