Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- import argparse
- import glob
- import json
- import os
- import re
- import sys
- import pandas as pd
- # --- Colour utilities -------------------------------------------------------
- CSS_HEX = {
- 'black':'#000000','white':'#000000','red':'#ff0000','green':'#008000',
- 'blue':'#0000ff','yellow':'#ffff00','gray':'#808080','grey':'#808080',
- 'orange':'#ffa500','pink':'#ffc0cb','purple':'#800080','beige':'#f5f5dc',
- 'mint':'#98ff98','chocolate':'#d2691e','cherry':'#de3163','lemon':'#fff700',
- 'cyan':'#00ffff','gold':'#ffd700','silver':'#c0c0c0','sky':'#87ceeb',
- 'burgundy':'#800020','cream':'#fffdd0','olive':'#808000','teal':'#008080',
- 'maroon':'#800000','navy':'#000080'
- }
- def hex_to_rgb(h: str) -> tuple[int,int,int]:
- h = h.lstrip('#')
- return tuple(int(h[i:i+2], 16) for i in (0,2,4))
- def contrast_color(bg_hex: str, fg_hex: str) -> str:
- bg = hex_to_rgb(bg_hex); fg = hex_to_rgb(fg_hex)
- if sum((bg[i]-fg[i])**2 for i in range(3)) < 10000:
- return '#000000'
- return fg_hex
- def get_css_color(name: str) -> str:
- return CSS_HEX.get(name.lower(), '#000000')
- # --- HTML parsing -----------------------------------------------------------
- def extract_store_name_from_url(url: str) -> str:
- # This is a simplified extraction. A more robust solution might involve
- # parsing the URL and extracting the domain or a specific path segment.
- # For now, let's try to get a recognizable part of the URL.
- match = re.search(r'ebay\.com\.au/str/([^/?]+)', url)
- if match:
- return match.group(1).replace('-', ' ').title()
- return url # Fallback to full URL if store name cannot be extracted
- def extract_page_url(html_path: str) -> str:
- txt = open(html_path, encoding='utf-8', errors='ignore').read()
- m = re.search(r'<link[^>]+rel=["\']canonical["\'][^>]+href=["\']([^"\']+)', txt, re.IGNORECASE)
- if m: return m.group(1)
- m = re.search(r'<meta[^>]+property=["\']og:url["\'][^>]+content=["\']([^"\']+)', txt, re.IGNORECASE)
- if m: return m.group(1)
- raise RuntimeError("Could not find page URL in HTML")
- def _extract_from_html(html: str) -> dict | None:
- marker = 'VariationViewModel'
- pos = html.find(marker)
- if pos < 0: return None
- start = html.rfind('{', 0, pos)
- if start < 0: return None
- depth = 0
- for i, ch in enumerate(html[start:], start):
- if ch == '{': depth += 1
- elif ch == '}':
- depth -= 1
- if depth == 0:
- try:
- return json.loads(html[start:i+1])
- except json.JSONDecodeError:
- return None
- return None
- def fetch_variation_model(path: str) -> dict:
- if not os.path.isfile(path):
- raise RuntimeError(f"Local file not found: {path}")
- raw = open(path, encoding='utf-8', errors='ignore').read()
- vm = _extract_from_html(raw)
- if vm is None:
- raise RuntimeError("VariationViewModel not found in HTML")
- return vm
- # --- DataFrame construction ------------------------------------------------
- def build_dataframe(vm: dict) -> pd.DataFrame:
- menus = vm.get('selectMenus', [])
- labels = ['Filament Type', 'Colour']
- if len(menus) >= 2:
- def detect(lbl: str) -> str|None:
- ll = lbl.lower()
- if any(k in ll for k in ('filament','product','material','type')):
- return 'Filament Type'
- if any(k in ll for k in ('color','colour')):
- return 'Colour'
- return None
- l0 = detect(menus[0].get('displayLabel',''))
- l1 = detect(menus[1].get('displayLabel',''))
- if l0 and l1 and l0 != l1:
- labels = [l0, l1]
- rows = []
- menu_map = vm['menuItemMap']
- for combo, vid in vm['variationCombinations'].items():
- if '_' not in combo: continue
- id0, id1 = combo.split('_',1)
- val0 = menu_map.get(id0,{}).get('valueName','')
- val1 = menu_map.get(id1,{}).get('valueName','')
- var = vm['variationsMap'].get(str(vid),{})
- price = var.get('binModel',{}).get('price',{}).get('value',{}).get('value', 0)
- oos = var.get('quantity',{}).get('outOfStock', False)
- rows.append({
- labels[0]: val0,
- labels[1]: val1,
- 'Price (AUD)': price,
- 'Out of Stock': 'Yes' if oos else 'No',
- 'pricing per spool if you buy 10': price * 0.6,
- 'with CBTJ20 20% discount': price * 0.6 * (1 - 0.20)
- })
- df = pd.DataFrame(rows)
- if 'Filament Type' in df.columns and 'Colour' in df.columns:
- rest = [c for c in df.columns if c not in ('Filament Type','Colour')]
- df = df[['Filament Type','Colour'] + rest]
- return df.sort_values(['Filament Type','Colour']).reset_index(drop=True)
- # --- Fill colours ----------------------------------------------------------
- def get_fill_color(name: str) -> str:
- n = name.lower()
- for base,col in [
- ('clear yellow','#ffffcc'),('clear red','#ffcccc'),
- ('clear green','#ccffcc'),('clear blue','#ccccff'),
- ('clear orange','#ffe5cc')
- ]:
- if base in n: return col
- if re.search(r'\bclear\b', n) or 'transparent' in n: return '#ffffff'
- if 'wood' in n: return '#f9f5eb'
- m = re.match(r'Dual-Color\s+(.+)', name, re.IGNORECASE)
- if m: return get_fill_color(re.split(r'\s*&\s*', m.group(1))[0])
- m = re.match(r'Tri-Color\s+(.+)', name, re.IGNORECASE)
- if m: return get_fill_color(re.split(r'\s*&\s*', m.group(1))[0])
- if 'rainbow' in n: return '#f5e0ff'
- for kw,col in [
- ('beige','#fff8e5'),('chocolate','#ffe4d1'),('cherry','#ffe5e5'),
- ('lemon','#ffffe5'),('cyan','#e5ffff'),('mint','#e5ffe5'),
- ('olive','#f0f0e0'),('burgundy','#f5e0e5'),('cream','#fffcf5'),
- ('teal','#e0f5f5'),('maroon','#f5e0e0'),('navy','#e0e5f5'),
- ('gold','#fff5cc'),('pink','#ffccdd'),('purple','#e5ccff'),
- ('orange','#ffe5cc'),('red','#ffcccc'),('green','#ccffcc'),
- ('blue','#ccccff'),('grey','#dddddd'),('gray','#dddddd')
- ]:
- if kw in n: return col
- if 'white' in n: return '#ffffff'
- if 'black' in n: return '#eeeeee'
- return '#f0f0f0'
- # --- Excel output ----------------------------------------------------------
- def process_sheet(writer, df: pd.DataFrame, html_file: str, sheet_name: str):
- import xlsxwriter
- page_url = extract_page_url(html_file)
- df.to_excel(writer, sheet_name=sheet_name, startrow=1, index=False)
- wb, ws = writer.book, writer.sheets[sheet_name]
- nr, nc = df.shape
- # hyperlink at A1
- link_fmt = wb.add_format({'font_color':'blue','underline':True})
- ws.write_url(0, 0, page_url, link_fmt, string=page_url)
- # autofilter
- ws.autofilter(1, 0, nr+1, nc-1)
- # currency formatting
- money_fmt = wb.add_format({'num_format':'$0.00'})
- for col in ['Price (AUD)', 'pricing per spool if you buy 10', 'with CBTJ20 20% discount']:
- idx = df.columns.get_loc(col)
- letter = xlsxwriter.utility.xl_col_to_name(idx)
- ws.set_column(f'{letter}:{letter}', None, money_fmt)
- # Colour column formatting
- cidx = df.columns.get_loc('Colour')
- for r, val in enumerate(df['Colour'], start=2):
- fill = get_fill_color(val)
- base_fmt = wb.add_format({'bg_color': fill})
- nl = val.lower()
- # Dual-Color
- m = re.match(r'(Dual-Color)\s+(.+)', val, re.IGNORECASE)
- if m:
- parts = re.split(r'\s*&\s*', m.group(2))
- fmt1 = wb.add_format({'font_color':contrast_color(fill,get_css_color(parts[0])),'bg_color':fill})
- fmt2 = wb.add_format({'font_color':contrast_color(fill,get_css_color(parts[1])),'bg_color':fill})
- ws.write_rich_string(r, cidx,
- fmt1, f"{m.group(1)} {parts[0]} & ",
- fmt2, parts[1],
- base_fmt)
- continue
- # Tri-Color
- m = re.match(r'(Tri-Color)\s+(.+)', val, re.IGNORECASE)
- if m:
- parts = re.split(r'\s*&\s*', m.group(2))
- fmts = [wb.add_format({'font_color':contrast_color(fill,get_css_color(p)),'bg_color':fill})
- for p in parts]
- segments = [fmts[0], f"{m.group(1)} {parts[0]} & ",
- fmts[1], f"{parts[1]} & ",
- fmts[2], parts[2]]
- ws.write_rich_string(r, cidx, *segments, base_fmt)
- continue
- # Rainbow
- if 'rainbow' in nl:
- m = re.match(r'(.*?)(Rainbow\d*|Rainbow)(.*)', val, re.IGNORECASE)
- if m:
- prefix, core, suffix = m.groups()
- parts = []
- if prefix: parts.append(prefix)
- rainbow_cols = ['#FF0000','#FF7F00','#FFFF00','#00FF00','#0000FF','#4B0082','#8F00FF']
- for i, ch in enumerate(core):
- fmt = wb.add_format({'font_color': contrast_color(fill,rainbow_cols[i % len(rainbow_cols)]),
- 'bg_color': fill})
- parts.extend([fmt, ch])
- if suffix: parts.append(suffix)
- ws.write_rich_string(r, cidx, *parts, base_fmt)
- continue
- # Default
- ws.write(r, cidx, val, base_fmt)
- # Out of Stock highlighting
- oidx = df.columns.get_loc('Out of Stock')
- red_fmt = writer.book.add_format({'bg_color':'#ffcccc','font_color':'#000000'})
- for r, v in enumerate(df['Out of Stock'], start=2):
- if str(v).strip().lower() == 'yes':
- ws.write(r, oidx, v, red_fmt)
- def process_sheet_combined(writer, df: pd.DataFrame, sheet_name: str):
- import xlsxwriter
- df.to_excel(writer, sheet_name=sheet_name, startrow=1, index=False)
- wb, ws = writer.book, writer.sheets[sheet_name]
- nr, nc = df.shape
- # hyperlink for Store Link column
- link_fmt = wb.add_format({'font_color':'blue','underline':True})
- sidx = df.columns.get_loc('Store Link')
- sname_idx = df.columns.get_loc('Store Name') # Get index for the new 'Store Name' column
- for r, url_val in enumerate(df['Store Link'], start=2):
- store_name_display = df.iloc[r-2, sname_idx] # Get the store name from the DataFrame using iloc
- ws.write_url(r, sidx, url_val, link_fmt, string=store_name_display)
- # autofilter
- ws.autofilter(1, 0, nr+1, nc-1)
- # currency formatting
- money_fmt = wb.add_format({'num_format':'$0.00'})
- for col in ['Price (AUD)', 'pricing per spool if you buy 10', 'with CBTJ20 20% discount']:
- idx = df.columns.get_loc(col)
- letter = xlsxwriter.utility.xl_col_to_name(idx)
- ws.set_column(f'{letter}:{letter}', None, money_fmt)
- # Colour column formatting
- if 'Colour' in df.columns:
- cidx = df.columns.get_loc('Colour')
- for r, val in enumerate(df['Colour'], start=2):
- fill = get_fill_color(val)
- base_fmt = wb.add_format({'bg_color': fill})
- nl = val.lower()
- # Dual-Color
- m = re.match(r'(Dual-Color)\s+(.+)', val, re.IGNORECASE)
- if m:
- parts = re.split(r'\s*&\s*', m.group(2))
- fmt1 = wb.add_format({'font_color':contrast_color(fill,get_css_color(parts[0])),'bg_color':fill})
- fmt2 = wb.add_format({'font_color':contrast_color(fill,get_css_color(parts[1])),'bg_color':fill})
- ws.write_rich_string(r, cidx,
- fmt1, f"{m.group(1)} {parts[0]} & ",
- fmt2, parts[1],
- base_fmt)
- continue
- # Tri-Color
- m = re.match(r'(Tri-Color)\s+(.+)', val, re.IGNORECASE)
- if m:
- parts = re.split(r'\s*&\s*', m.group(2))
- fmts = [wb.add_format({'font_color':contrast_color(fill,get_css_color(p)),'bg_color':fill})
- for p in parts]
- segments = [fmts[0], f"{m.group(1)} {parts[0]} & ",
- fmts[1], f"{parts[1]} & ",
- fmts[2], parts[2]]
- ws.write_rich_string(r, cidx, *segments, base_fmt)
- continue
- # Rainbow
- if 'rainbow' in nl:
- m = re.match(r'(.*?)(Rainbow\d*|Rainbow)(.*)', val, re.IGNORECASE)
- if m:
- prefix, core, suffix = m.groups()
- parts = []
- if prefix: parts.append(prefix)
- rainbow_cols = ['#FF0000','#FF7F00','#FFFF00','#00FF00','#0000FF','#4B0082','#8F00FF']
- for i, ch in enumerate(core):
- fmt = wb.add_format({'font_color': contrast_color(fill,rainbow_cols[i % len(rainbow_cols)]),
- 'bg_color': fill})
- parts.extend([fmt, ch])
- if suffix: parts.append(suffix)
- ws.write_rich_string(r, cidx, *parts, base_fmt)
- continue
- # Default
- ws.write(r, cidx, val, base_fmt)
- # Out of Stock highlighting
- oidx = df.columns.get_loc('Out of Stock')
- red_fmt = writer.book.add_format({'bg_color':'#ffcccc','font_color':'#000000'})
- for r, v in enumerate(df['Out of Stock'], start=2):
- if str(v).strip().lower() == 'yes':
- ws.write(r, oidx, v, red_fmt)
- def main():
- p = argparse.ArgumentParser(description="Extract eBay variation data → pastel-coloured Excel")
- p.add_argument('sources', nargs='*', help="HTML file(s) to process")
- p.add_argument('-all', action='store_true', help="Process all .html files in folder")
- p.add_argument('-o','--output', help="Output .xlsx filename (single-file mode)")
- args = p.parse_args()
- if args.all:
- html_files = glob.glob("*.html")
- if not html_files:
- print("No .html files found.", file=sys.stderr)
- sys.exit(1)
- combined_writer = pd.ExcelWriter("combined.xlsx", engine='xlsxwriter')
- all_dataframes = []
- individual_sheet_details = [] # To store (df, html_file, sheet_name) for later writing
- else:
- html_files = args.sources
- if not html_files:
- p.error("Specify HTML file(s) or use -all")
- for html in html_files:
- if not os.path.isfile(html):
- print(f"Skipping missing file: {html}", file=sys.stderr)
- continue
- base = os.path.splitext(os.path.basename(html))[0]
- out_file = f"{base}.xlsx" if args.all else (args.output or f"{base}.xlsx")
- try:
- vm = fetch_variation_model(html)
- df = build_dataframe(vm)
- page_url = extract_page_url(html) # Extract URL here
- # write individual workbook if not in -all mode
- if not args.all:
- writer = pd.ExcelWriter(out_file, engine='xlsxwriter')
- process_sheet(writer, df, html, sheet_name="Variations")
- writer.close()
- print(f"Wrote {len(df)} rows to {out_file}")
- # Collect data for combined workbook
- if args.all:
- df_with_link = df.copy()
- df_with_link.insert(0, 'Store Link', page_url)
- df_with_link.insert(1, 'Store Name', base.replace('_', ' ').title()) # Use base filename as store name
- all_dataframes.append(df_with_link)
- individual_sheet_details.append((df, html, base[:31])) # Store for later
- except Exception as e:
- print(f"Error processing {html}: {e}", file=sys.stderr)
- if args.all:
- if all_dataframes:
- combined_df = pd.concat(all_dataframes, ignore_index=True)
- # Write the combined dataframe to the first sheet
- process_sheet_combined(combined_writer, combined_df, sheet_name="All Filaments")
- print(f"Wrote {len(combined_df)} rows to 'All Filaments' sheet in combined.xlsx")
- # Now write individual sheets after the combined sheet
- for df_ind, html_ind, sheet_name_ind in individual_sheet_details:
- try:
- process_sheet(combined_writer, df_ind, html_ind, sheet_name=sheet_name_ind)
- except Exception as e:
- print(f"Error processing {html_ind} for combined workbook (individual sheet): {e}", file=sys.stderr)
- combined_writer.close()
- print("Wrote combined workbook to combined.xlsx")
- if __name__ == '__main__':
- main()
Advertisement
Add Comment
Please, Sign In to add comment