Advertisement
Guest User

Untitled

a guest
Jan 22nd, 2017
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.72 KB | None | 0 0
  1. import numpy as np
  2. import re
  3. import pandas as pd
  4.  
  5. def domain2regexp(line, neg=False):
  6.     splt = re.sub('^https?://(?:m\.|www\.)?|/$|\.[^/]{4}$', '', line).split('/', 1)
  7.     domain, excess = splt if len(splt) > 1 else (splt[0], '')
  8.     new_line = domain + ' @ ^https?://(www\.)?([a-z\-_]+\.)?' + re.escape(domain)
  9.     if neg:
  10.         excess_new = excess.split('/')
  11.         new_line += u'/'    + '/'.join(excess_new[:-1]) + '/(?!' + excess_new[-1] + ')'
  12.     else:
  13.         new_line += u'/'    + excess
  14.     return new_line + '.*$'
  15.  
  16. mas = np.array([['http://www.westmister.eu/collection/verhnyaya-odezhda', True]
  17.                ],
  18.                dtype='O')
  19.  
  20. for i in mas:
  21.     reg = domain2regexp(i[0], i[1])
  22.     print(reg)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement