Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import re
- import pandas as pd
def domain2regexp(line, neg=False):
    r"""Build a ``"<domain> @ <regex>"`` rule string from a URL.

    The URL is first normalised by removing a leading ``http://`` or
    ``https://`` (together with an optional ``m.`` or ``www.`` prefix), a
    single trailing ``/``, and a trailing dot followed by exactly four
    non-slash characters (e.g. ``.html``).  The text before the first
    remaining ``/`` is the domain; everything after it is the path.

    Args:
        line: URL (or bare ``domain/path`` string) to convert.
        neg: When false, the regex matches URLs on the domain whose path
            starts with the given path.  When true, the regex matches URLs
            that share every path segment except the last one and whose
            next segment does NOT start with the last segment (negative
            lookahead).  If the path is empty, the lookahead is empty and
            the resulting pattern can never match.

    Returns:
        A string of the form ``domain @ ^https?://...<escaped domain>/....*$``.
    """
    # Raw strings keep the regex backslashes literal without relying on
    # Python passing unknown escape sequences such as '\.' through.
    stripped = re.sub(r'^https?://(?:m\.|www\.)?|/$|\.[^/]{4}$', '', line)
    domain, _, excess = stripped.partition('/')

    new_line = (
        domain
        + r' @ ^https?://(www\.)?([a-z\-_]+\.)?'
        + re.escape(domain)
    )

    # NOTE(review): the path is inserted into the regex unescaped, unlike
    # the domain -- regex metacharacters in the path are interpreted as such.
    if neg:
        segments = excess.split('/')
        # Each parent segment carries its own trailing slash, so a path with
        # a single segment yields "/(?!seg)" rather than "//(?!seg)".
        parents = ''.join(segment + '/' for segment in segments[:-1])
        new_line += '/' + parents + '(?!' + segments[-1] + ')'
    else:
        new_line += '/' + excess
    return new_line + '.*$'
# Sample input: one row per URL, as (url, negate) pairs, stored in an
# object-dtype array so each cell keeps its original Python type.
mas = np.array(
    [
        ['http://www.westmister.eu/collection/verhnyaya-odezhda', True],
    ],
    dtype=object,
)

# Convert every sample row to its rule string and print it.
for url, negate in mas:
    print(domain2regexp(url, negate))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement