Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re, sys
- import itertools
- import matplotlib.pyplot as plt
- #fname = 'angle_grinder_pages'
class Debug(object):
    """Mixin providing simple console tracing helpers."""

    # Types that printq prints as one value instead of iterating over them.
    try:
        _text_types = (basestring,)  # Python 2: covers str and unicode
    except NameError:
        _text_types = (str,)  # Python 3

    def printq(self, txt=''):
        """Print ``txt`` framed by a ``+`` line above and a ``!`` line below.

        A string is printed as a single line. Any other iterable is printed
        one element per line. A non-iterable value (for example an int) is
        printed as a single line instead of raising ``TypeError``.
        """
        print('+')
        if isinstance(txt, self._text_types):
            print(txt)
        else:
            try:
                entries = iter(txt)
            except TypeError:
                # Not iterable: fall back to printing the value itself.
                print(txt)
            else:
                for entry in entries:
                    print(entry)
        print('!')

    def debug(self, txt):
        """Print ``txt`` unchanged."""
        print(txt)
class FatalError(Exception):
    """Unrecoverable error; the message is echoed to stdout on construction."""

    def __init__(self, msg):
        # Record the message on the exception, then echo it right away.
        Exception.__init__(self, msg)
        print(msg)
class Plot(object):
    """Scatter-plot one attribute of every collected ``Item`` against another.

    The plot configuration (``x``, ``y``, ``k_on``, ``x_limit``, ``y_limit``)
    is stored on the ``Plot`` class itself, not on the instance, so the most
    recent construction sets it for every user of the class.
    """

    # Font settings forwarded to ``plt.rc('font', ...)``.
    font = {'family': 'monospace',
            'weight': 'normal',
            'size': 10}

    def plot(self):
        """Draw every entry of ``Item.items`` and wait for the user to press Enter.

        Items are sorted in place by price first. An item is skipped (and
        reported on stdout) when it lacks either plotted attribute or when the
        attribute is not numeric. It is skipped silently when ``k_on`` is set
        and its colour is the no-brand colour ``'k'``, or when it exceeds a
        configured limit. Each point is labelled with the first letter of the
        brand followed by the item's index in the sorted list.
        """
        plt.rc('font', **self.font)
        # Sort in place on price. Items without a parsed price sort last
        # instead of aborting the whole plot with AttributeError.
        Item.items.sort(
            key=lambda entry: float(getattr(entry, 'price', float('inf'))))

        for count, item in enumerate(Item.items):
            try:
                # Convert before comparing: attributes may hold numeric text,
                # and text-vs-number comparison is always true on Python 2
                # (and a TypeError on Python 3).
                x = float(getattr(item, Plot.x))
                y = float(getattr(item, Plot.y))
            except (AttributeError, TypeError, ValueError) as e:
                print('%s %s' % (getattr(item, 'title', ''), e))
                continue
            if Plot.k_on and 'k' in item.color:
                continue
            # x_limit=None disables both limits (kept for compatibility);
            # y_limit=None disables only the y check.
            if Plot.x_limit is not None:
                if x > Plot.x_limit:
                    continue
                if Plot.y_limit is not None and y > Plot.y_limit:
                    continue
            print('%s %s %s %s %s' % (x, y, item.price, item.title, count))
            plt.scatter(x, y, color=item.color, s=50)
            # Label passed positionally: the keyword was renamed from ``s``
            # to ``text`` in matplotlib 3.3.
            plt.annotate(item.brand[0:1] + str(count), xy=(x, y))
        plt.show(block=False)

        # Keep the non-blocking window open until the user presses Enter.
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input  # Python 3
        wait_for_enter('>')

    def __init__(self, item_attributes=('price', 'power'), k_on=1,
                 x_limit=10000, y_limit=5000):
        """Store the plot configuration on the class and draw immediately.

        item_attributes -- pair of attribute names plotted as (x, y).
        k_on            -- when truthy, items coloured ``'k'`` are left out.
        x_limit         -- largest x value plotted; ``None`` disables both
                           the x and the y limit.
        y_limit         -- largest y value plotted; ``None`` disables it.
        """
        Plot.k_on = k_on
        Plot.x_limit = x_limit
        Plot.y_limit = y_limit
        Plot.x = item_attributes[0]
        Plot.y = item_attributes[1]
        self.plot()
- #--------------------------------------------------------------------------------
class Item(FatalError, Debug, object):
    """One product listing parsed out of scraped page text.

    Construction extracts the title, the feature / description /
    specification sections, the price and the brand from the raw listing
    text, and stores them as attributes. Feature and specification lines
    become attributes named after their lower-cased key (which may contain
    spaces, so read them with ``getattr`` or through ``__dict__``).
    """

    fmt_spec = r'input|output ^\s+.+Rs\..+ ^.+$'

    # Class-wide registry; every item that has a title is appended here.
    items = []

    # Plot colour for each group of known brands. Brands are tried in this
    # order, so the result does not depend on dict ordering.
    _BRAND_COLORS = (
        ('g', ('Bosch', 'Makita', 'Hitachi')),
        ('m', ('Walt', 'Maktec', 'Decker', 'Skil', 'Stanley', 'Ralli')),
        ('r', ('Metabo', 'Milwaukee', 'Fein', 'Festool')),
        ('y', ('Dongcheng', 'Cumi', 'JCB', 'Ferm', 'Maf', 'Eastman',
               'Yking')),
    )

    @staticmethod
    def _first_amount(candidates):
        """Return the first candidate that parses as a number, else None.

        Thousands separators (``,``) and a trailing ``.`` are removed before
        conversion to float.
        """
        for text in candidates:
            try:
                return float(text.replace(',', '').rstrip('.'))
            except ValueError:
                continue
        return None

    def x_brand(self):
        """Set ``self.brand`` and ``self.color`` from the title.

        The title is searched case-insensitively for each known brand. When
        none matches, ``brand`` is ``''`` and ``color`` is ``'k'``.
        """
        for color, brands in self._BRAND_COLORS:
            for brand in brands:
                if re.search(brand, self.title, flags=re.I):
                    self.color = color
                    self.brand = brand
                    return
        # No known brand: default colour.
        self.brand = ''
        self.color = 'k'

    def x_title(self, txt):
        """Set ``self.title`` from ``txt``; leave it untouched if not found."""
        found = re.findall(r'([A-Z].+?)\n. +? by ', txt, re.M|re.S|re.X)
        if found:
            self.title = found[0]

    def x_features(self, lis_t, pat):
        """Turn ``key: value`` feature lines into attributes.

        Does nothing unless ``pat`` is the features pattern. The text is
        lower-cased first; only the text between the first and second ``:``
        is used as the value. Lines without a ``:`` are ignored.
        """
        if 'Features' not in pat:
            return
        if not lis_t:
            self.features = ''
            return
        for line in lis_t.strip().lower().split('\n'):
            parts = line.split(':')
            if len(parts) < 2:
                continue  # not a "key: value" line
            key = parts[0].strip()
            value = parts[1].strip()
            # NOTE(review): the prefix is read from the 'features' attribute,
            # not from ``key`` -- possibly meant to be getattr(self, key, '').
            # Left as in the original; confirm the intent before changing.
            prefix = getattr(self, 'features', '')
            setattr(self, key, prefix + value)

    def x_description(self, lis_t, pat):
        """Store the description as one line in ``self.description``.

        Does nothing unless ``pat`` is the description pattern. Every input
        line is stripped and followed by a single space.
        """
        if 'Description' not in pat:
            return
        if not lis_t:
            self.description = ''
            return
        self.description = ''.join(
            line.strip() + ' ' for line in lis_t.split('\n'))

    def x_product(self, lis_t, pat):
        """Turn specification lines into attributes.

        Does nothing unless ``pat`` is the specifications pattern. Each line
        is split at the first digit: the text before it (lower-cased and
        stripped) becomes the attribute name, the rest becomes its value.
        """
        if 'Specifications' not in pat:
            return
        for line in lis_t.split('\n'):
            if not line:
                continue
            m = re.match(r'^([^0-9]+)([0-9-,.]+.+)', line)
            if m:
                setattr(self, m.group(1).lower().strip(), m.group(2).strip())

    def x_price(self, lis_t, pat):
        """Set ``self.price`` (float) from the price section.

        Does nothing unless ``pat`` is the price pattern. ``DEL`` and
        ``per piece`` lines stop the scan when a price is already set. An
        ``Rs. ... Extra`` line overrides any earlier price. Lines whose
        amount cannot be parsed are skipped.
        """
        if 'Rs' not in pat:
            return
        for line in lis_t.split('\n'):
            if 'DEL' in line:
                if getattr(self, 'price', None):
                    return
                amount = self._first_amount(re.findall(r'[0-9,]+', line))
            elif 'per piece' in line:
                if getattr(self, 'price', None):
                    return
                m = re.search(r'([0-9,]+)', line)
                amount = self._first_amount([m.group()] if m else [])
            elif re.match(r'\s+Rs\s* \. .+?Extra', line, flags=re.X):
                # The captured amount may contain thousands separators.
                amount = self._first_amount(
                    re.findall(r'\s+Rs\s*\.\s*([0-9.,]+)', line))
            else:
                continue
            if amount is not None:
                self.price = amount

    def __init__(self, item=''):
        """Parse the raw listing text ``item``.

        An empty ``item`` produces a blank, unregistered object. Raises
        ``FatalError`` when no title can be found. Exits the process with
        status 1 when a section pattern matches more than once.
        """
        if not item:
            return
        if re.match(r'Drilling Machine', item):
            titles = re.findall(r'\s*\[ [0-9]+ ]([A-Z].+?)\n', item, re.X)
            if titles:
                self.title = titles[0]
            amount = self._first_amount(
                re.findall(r'Rs\.\s+([0-9,]+)', item))
            if amount is not None:
                self.price = amount
                self.price_del = amount
        self.item = item
        self.x_title(item)
        if not getattr(self, 'title', ''):
            raise FatalError('no title for ' + item)
        # Register only once the item is known to have a title.
        Item.items.append(self)
        pats = [
            r'by.+?\s+ Features\n\s+ (.+?) (?=\[)',
            r'\n\s+Description (.+) (?=Product \s Spec)',
            r'\nProduct \s Specifications\n(.+) (?=\nQuick)',
            r'(Rs.+)'
        ]
        for pat in pats:
            found = re.findall(pat, item, flags=re.X|re.M|re.S)
            if not found:
                continue
            # Each pattern must yield exactly one or zero matches.
            if len(found) > 1:
                print('%s   %s %s' % (len(found), pat, self.title))
                sys.exit(1)
            section = found[0]
            # Each extractor ignores sections that are not its own.
            self.x_features(section, pat)
            self.x_description(section, pat)
            self.x_product(section, pat)
            self.x_price(section, pat)
        self.x_brand()

    def __str__(self):
        return ''
class ImpactDrill(Item, Plot):
    """Impact-drill listing: an ``Item`` that also gets a ``power`` value."""

    def x_pwr(self):
        """Set ``self.power`` to a number.

        The first attribute whose name contains ``power`` or ``watt`` and
        whose value parses is used: non-numeric characters are removed and,
        for a range such as ``500-700``, the lower bound is taken. Failing
        that, a ``<digits> W`` figure in the title is used. ``power`` is 0
        when nothing is found.
        """
        title = getattr(self, 'title', '')
        # Iterate over a snapshot: assigning self.power may add a key.
        for key, value in list(self.__dict__.items()):
            if 'power' not in key and 'watt' not in key:
                continue
            try:
                digits = re.sub(r'[^0-9-.]', '', value)
                self.power = float(digits.split('-')[0])
            except (TypeError, ValueError) as e:
                print('%s %s %s %s' % (e, key, value, title))
                continue
            return
        # Corner case: take the wattage from the title.
        m = re.search(r'([0-9]+)\s*W', title)
        if m is not None:
            # Stored as float, like the attribute-based result above.
            self.power = float(m.group(1))
            return
        self.power = 0

    def __init__(self, item):
        """Parse ``item`` as an ``Item`` and then derive ``power``."""
        super(ImpactDrill, self).__init__(item)
        self.x_pwr()
class AngleGrinder(Item, Plot):
    """Angle-grinder listing: an ``Item`` with ``power`` and ``dia`` values."""

    def x_pwr(self):
        """Set ``self.power`` to a number.

        The first attribute whose name contains ``power`` or ``watt`` and
        whose value parses is used: non-numeric characters are removed and,
        for a range such as ``500-700``, the lower bound is taken. Failing
        that, a ``<digits> W`` figure in the title is used. ``power`` is 0
        when nothing is found.
        """
        title = getattr(self, 'title', '')
        # Iterate over a snapshot: assigning self.power may add a key.
        for key, value in list(self.__dict__.items()):
            if 'power' not in key and 'watt' not in key:
                continue
            try:
                digits = re.sub(r'[^0-9-.]', '', value)
                self.power = float(digits.split('-')[0])
            except (TypeError, ValueError) as e:
                print('%s %s %s %s' % (e, key, value, title))
                continue
            return
        # Corner case: take the wattage from the title.
        m = re.search(r'([0-9]+)\s*W', title)
        if m is not None:
            # Stored as float, like the attribute-based result above.
            self.power = float(m.group(1))
            return
        self.power = 0

    @staticmethod
    def _parse_dia(value):
        """Convert a specification value to a diameter in millimetres.

        Only the part before the first ``/`` is used. It is read as inches
        (and multiplied by 25.4) when it carries no ``mm`` itself but the
        value mentions ``inch``. Returns a float when the text is numeric,
        otherwise the cleaned text.
        NOTE(review): fractional inch sizes such as ``4-1/2 inch`` are not
        understood; only the part before the slash is read.
        """
        head = value.split('/')[0]
        if not re.search(r'mm', head, re.I) and re.search(r'inch', value, re.I):
            try:
                return float(re.sub(r'[^0-9.]', '', head)) * 25.4
            except ValueError:
                pass  # no usable number: fall through to the plain cleanup
        cleaned = re.sub(r'mm|max', '', head, flags=re.I).strip()
        try:
            return float(cleaned)
        except ValueError:
            return cleaned

    def x_dia(self):
        """Set ``self.dia`` (millimetres).

        The first textual attribute whose name contains ``dia``,
        ``wheel size`` or ``skid length`` is used. Failing that, a
        ``<digits> mm`` or ``<number> inch`` figure in the title is used.
        ``dia`` is 0 when nothing is found.
        """
        # Attribute names are lower-cased when they are parsed, so the
        # needles must be lower case too.
        needles = ('dia', 'wheel size', 'skid length')
        for key, value in list(self.__dict__.items()):
            lowered = key.lower()
            if not any(needle in lowered for needle in needles):
                continue
            try:
                self.dia = self._parse_dia(value)
            except (AttributeError, TypeError):
                continue  # value is not text
            return

        title = getattr(self, 'title', '')
        m = re.search(r'([0-9]+)\s*mm', title, flags=re.I)
        if m is not None:
            self.dia = int(m.group(1))
            return
        m = re.search(r'([0-9.]+)\s*(inch)', title, flags=re.I)
        if m is not None:
            try:
                # Convert the text to a number before scaling to millimetres.
                self.dia = float(m.group(1)) * 25.4
                return
            except ValueError as e:
                print('%s %s %s' % (e, m.group(1), title))
        self.dia = 0

    def __init__(self, item):
        """Parse ``item`` as an ``Item``, then derive ``power`` and ``dia``."""
        super(AngleGrinder, self).__init__(item)
        self.x_pwr()
        self.x_dia()
class JigSaw(AngleGrinder, Item, Plot):
    """Jigsaw listing; parsed exactly like an ``AngleGrinder``."""
class Drill(AngleGrinder, Item, Plot):
    """Drill listing; parsed exactly like an ``AngleGrinder``."""
class ChopSaw(Item, Plot):
    """Chop-saw listing; a plain ``Item`` with no extra parsing of its own."""
class HotAir(AngleGrinder, Item, Plot):
    """Hot-air tool listing; parsed exactly like an ``AngleGrinder``."""
class Saw(AngleGrinder, Item, Plot):
    """Saw listing; parsed exactly like an ``AngleGrinder``."""
- #--------------------------------------------------------------------------------
class IndustryBuying(Plot, Debug, object):
    """Load a saved listing dump, build items from it and plot them."""

    # Maps the dump's file name to the class used to parse its listings.
    f_to_cls = {'jigsaw_pages': JigSaw,
                'angle_grinder_pages': AngleGrinder,
                'impact_drill_pages': ImpactDrill,
                'drill_pages': Drill,
                'chopsaw_pages': ChopSaw,
                'saw_pages': Saw,
                'hotair_pages': HotAir}

    def slurp_file(self, fname):
        """Return the whole content of the file ``fname``."""
        # The context manager closes the handle even if read() fails.
        with open(fname, 'r') as fh:
            return fh.read()

    def get_items_from(self, txt):
        """Split ``txt`` into pages and return the listing texts of all pages.

        Prints the index of every page as it is processed. Raises
        ``FatalError`` when ``txt`` contains no page.
        """
        # A page runs from "Home [" to a line starting with "Company".
        pages = re.findall(r'Home \[(.+?)^Company', txt, re.M|re.S)
        if not pages:
            raise FatalError('no pages')
        # A listing runs from a capitalised line up to "ADD TO CART".
        items = []
        for count, page in enumerate(pages):
            items.extend(
                re.findall(r'\n([A-Z]+.+?by.+?Rs.+?)(?=ADD TO CART)', page,
                           re.M|re.S))
            self.debug(count)
        return items

    def __init__(self, fname=''):
        """Read ``fname``, build one item per listing, then plot price vs power.

        The parsing class is chosen from ``f_to_cls`` by the base name of
        ``fname``, so the file may live in another directory. Raises
        ``KeyError`` for an unknown file name.
        """
        import os.path

        txt = self.slurp_file(fname)
        items = self.get_items_from(txt)
        Cls = self.f_to_cls[os.path.basename(fname)]
        # Constructing an item registers it in the shared item list.
        for item in items:
            Cls(item)
        # x_limit=None switches off both limits.
        Plot.__init__(self, item_attributes=('price', 'power'), k_on=0,
                      x_limit=25000, y_limit=5000)
- #--------------------------------------------------------------------------------
if __name__ == '__main__':
    # The dump to plot may be given as the first command-line argument.
    # Without one, 'angle_grinder_pages' is used. Known dump names:
    # drill_pages, impact_drill_pages, angle_grinder_pages, saw_pages,
    # hotair_pages, jigsaw_pages, chopsaw_pages.
    x = IndustryBuying(sys.argv[1] if len(sys.argv) > 1
                       else 'angle_grinder_pages')
Add Comment
Please, Sign In to add comment