Guest User

Untitled

a guest
Jul 23rd, 2025
10
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 16.56 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. import argparse
  3. import glob
  4. import json
  5. import os
  6. import re
  7. import sys
  8.  
  9. import pandas as pd
  10.  
  11.  
  12. # --- Colour utilities -------------------------------------------------------
  13.  
  14. CSS_HEX = {
  15. 'black':'#000000','white':'#000000','red':'#ff0000','green':'#008000',
  16. 'blue':'#0000ff','yellow':'#ffff00','gray':'#808080','grey':'#808080',
  17. 'orange':'#ffa500','pink':'#ffc0cb','purple':'#800080','beige':'#f5f5dc',
  18. 'mint':'#98ff98','chocolate':'#d2691e','cherry':'#de3163','lemon':'#fff700',
  19. 'cyan':'#00ffff','gold':'#ffd700','silver':'#c0c0c0','sky':'#87ceeb',
  20. 'burgundy':'#800020','cream':'#fffdd0','olive':'#808000','teal':'#008080',
  21. 'maroon':'#800000','navy':'#000080'
  22. }
  23.  
  24. def hex_to_rgb(h: str) -> tuple[int,int,int]:
  25. h = h.lstrip('#')
  26. return tuple(int(h[i:i+2], 16) for i in (0,2,4))
  27.  
  28. def contrast_color(bg_hex: str, fg_hex: str) -> str:
  29. bg = hex_to_rgb(bg_hex); fg = hex_to_rgb(fg_hex)
  30. if sum((bg[i]-fg[i])**2 for i in range(3)) < 10000:
  31. return '#000000'
  32. return fg_hex
  33.  
  34. def get_css_color(name: str) -> str:
  35. return CSS_HEX.get(name.lower(), '#000000')
  36.  
  37.  
  38. # --- HTML parsing -----------------------------------------------------------
  39.  
  40. def extract_store_name_from_url(url: str) -> str:
  41. # This is a simplified extraction. A more robust solution might involve
  42. # parsing the URL and extracting the domain or a specific path segment.
  43. # For now, let's try to get a recognizable part of the URL.
  44. match = re.search(r'ebay\.com\.au/str/([^/?]+)', url)
  45. if match:
  46. return match.group(1).replace('-', ' ').title()
  47. return url # Fallback to full URL if store name cannot be extracted
  48.  
  49. def extract_page_url(html_path: str) -> str:
  50. txt = open(html_path, encoding='utf-8', errors='ignore').read()
  51. m = re.search(r'<link[^>]+rel=["\']canonical["\'][^>]+href=["\']([^"\']+)', txt, re.IGNORECASE)
  52. if m: return m.group(1)
  53. m = re.search(r'<meta[^>]+property=["\']og:url["\'][^>]+content=["\']([^"\']+)', txt, re.IGNORECASE)
  54. if m: return m.group(1)
  55. raise RuntimeError("Could not find page URL in HTML")
  56.  
  57. def _extract_from_html(html: str) -> dict | None:
  58. marker = 'VariationViewModel'
  59. pos = html.find(marker)
  60. if pos < 0: return None
  61. start = html.rfind('{', 0, pos)
  62. if start < 0: return None
  63. depth = 0
  64. for i, ch in enumerate(html[start:], start):
  65. if ch == '{': depth += 1
  66. elif ch == '}':
  67. depth -= 1
  68. if depth == 0:
  69. try:
  70. return json.loads(html[start:i+1])
  71. except json.JSONDecodeError:
  72. return None
  73. return None
  74.  
  75. def fetch_variation_model(path: str) -> dict:
  76. if not os.path.isfile(path):
  77. raise RuntimeError(f"Local file not found: {path}")
  78. raw = open(path, encoding='utf-8', errors='ignore').read()
  79. vm = _extract_from_html(raw)
  80. if vm is None:
  81. raise RuntimeError("VariationViewModel not found in HTML")
  82. return vm
  83.  
  84.  
  85. # --- DataFrame construction ------------------------------------------------
  86.  
  87. def build_dataframe(vm: dict) -> pd.DataFrame:
  88. menus = vm.get('selectMenus', [])
  89. labels = ['Filament Type', 'Colour']
  90. if len(menus) >= 2:
  91. def detect(lbl: str) -> str|None:
  92. ll = lbl.lower()
  93. if any(k in ll for k in ('filament','product','material','type')):
  94. return 'Filament Type'
  95. if any(k in ll for k in ('color','colour')):
  96. return 'Colour'
  97. return None
  98. l0 = detect(menus[0].get('displayLabel',''))
  99. l1 = detect(menus[1].get('displayLabel',''))
  100. if l0 and l1 and l0 != l1:
  101. labels = [l0, l1]
  102.  
  103. rows = []
  104. menu_map = vm['menuItemMap']
  105. for combo, vid in vm['variationCombinations'].items():
  106. if '_' not in combo: continue
  107. id0, id1 = combo.split('_',1)
  108. val0 = menu_map.get(id0,{}).get('valueName','')
  109. val1 = menu_map.get(id1,{}).get('valueName','')
  110. var = vm['variationsMap'].get(str(vid),{})
  111. price = var.get('binModel',{}).get('price',{}).get('value',{}).get('value', 0)
  112. oos = var.get('quantity',{}).get('outOfStock', False)
  113. rows.append({
  114. labels[0]: val0,
  115. labels[1]: val1,
  116. 'Price (AUD)': price,
  117. 'Out of Stock': 'Yes' if oos else 'No',
  118. 'pricing per spool if you buy 10': price * 0.6,
  119. 'with CBTJ20 20% discount': price * 0.6 * (1 - 0.20)
  120. })
  121.  
  122. df = pd.DataFrame(rows)
  123. if 'Filament Type' in df.columns and 'Colour' in df.columns:
  124. rest = [c for c in df.columns if c not in ('Filament Type','Colour')]
  125. df = df[['Filament Type','Colour'] + rest]
  126. return df.sort_values(['Filament Type','Colour']).reset_index(drop=True)
  127.  
  128.  
  129. # --- Fill colours ----------------------------------------------------------
  130.  
  131. def get_fill_color(name: str) -> str:
  132. n = name.lower()
  133. for base,col in [
  134. ('clear yellow','#ffffcc'),('clear red','#ffcccc'),
  135. ('clear green','#ccffcc'),('clear blue','#ccccff'),
  136. ('clear orange','#ffe5cc')
  137. ]:
  138. if base in n: return col
  139. if re.search(r'\bclear\b', n) or 'transparent' in n: return '#ffffff'
  140. if 'wood' in n: return '#f9f5eb'
  141. m = re.match(r'Dual-Color\s+(.+)', name, re.IGNORECASE)
  142. if m: return get_fill_color(re.split(r'\s*&\s*', m.group(1))[0])
  143. m = re.match(r'Tri-Color\s+(.+)', name, re.IGNORECASE)
  144. if m: return get_fill_color(re.split(r'\s*&\s*', m.group(1))[0])
  145. if 'rainbow' in n: return '#f5e0ff'
  146. for kw,col in [
  147. ('beige','#fff8e5'),('chocolate','#ffe4d1'),('cherry','#ffe5e5'),
  148. ('lemon','#ffffe5'),('cyan','#e5ffff'),('mint','#e5ffe5'),
  149. ('olive','#f0f0e0'),('burgundy','#f5e0e5'),('cream','#fffcf5'),
  150. ('teal','#e0f5f5'),('maroon','#f5e0e0'),('navy','#e0e5f5'),
  151. ('gold','#fff5cc'),('pink','#ffccdd'),('purple','#e5ccff'),
  152. ('orange','#ffe5cc'),('red','#ffcccc'),('green','#ccffcc'),
  153. ('blue','#ccccff'),('grey','#dddddd'),('gray','#dddddd')
  154. ]:
  155. if kw in n: return col
  156. if 'white' in n: return '#ffffff'
  157. if 'black' in n: return '#eeeeee'
  158. return '#f0f0f0'
  159.  
  160.  
  161. # --- Excel output ----------------------------------------------------------
  162.  
def process_sheet(writer, df: pd.DataFrame, html_file: str, sheet_name: str):
    """Write *df* to *sheet_name* with link, autofilter, and colour formatting.

    Sheet layout: row 0 holds a hyperlink back to the source page, row 1 the
    column headers, data from row 2 down. Price columns get a currency
    format; the 'Colour' column gets pastel fills (rich strings for
    Dual-/Tri-Color and Rainbow names); 'Out of Stock' == 'Yes' cells are
    highlighted red.
    """
    import xlsxwriter
    page_url = extract_page_url(html_file)
    # startrow=1 leaves spreadsheet row 0 free for the source hyperlink.
    df.to_excel(writer, sheet_name=sheet_name, startrow=1, index=False)
    wb, ws = writer.book, writer.sheets[sheet_name]
    nr, nc = df.shape

    # hyperlink at A1
    link_fmt = wb.add_format({'font_color':'blue','underline':True})
    ws.write_url(0, 0, page_url, link_fmt, string=page_url)

    # autofilter over the header row (1) and all data rows (2 .. nr+1)
    ws.autofilter(1, 0, nr+1, nc-1)

    # currency formatting for the three price columns
    money_fmt = wb.add_format({'num_format':'$0.00'})
    for col in ['Price (AUD)', 'pricing per spool if you buy 10', 'with CBTJ20 20% discount']:
        idx = df.columns.get_loc(col)
        letter = xlsxwriter.utility.xl_col_to_name(idx)
        ws.set_column(f'{letter}:{letter}', None, money_fmt)

    # Colour column formatting: pastel fill per cell; multi-colour names are
    # written as rich strings with per-segment font colours.
    cidx = df.columns.get_loc('Colour')
    for r, val in enumerate(df['Colour'], start=2):
        fill = get_fill_color(val)
        base_fmt = wb.add_format({'bg_color': fill})
        nl = val.lower()

        # Dual-Color: two '&'-separated segments, each tinted to its colour
        m = re.match(r'(Dual-Color)\s+(.+)', val, re.IGNORECASE)
        if m:
            parts = re.split(r'\s*&\s*', m.group(2))
            fmt1 = wb.add_format({'font_color':contrast_color(fill,get_css_color(parts[0])),'bg_color':fill})
            fmt2 = wb.add_format({'font_color':contrast_color(fill,get_css_color(parts[1])),'bg_color':fill})
            ws.write_rich_string(r, cidx,
                                 fmt1, f"{m.group(1)} {parts[0]} & ",
                                 fmt2, parts[1],
                                 base_fmt)
            continue

        # Tri-Color: three segments. NOTE(review): assumes the name holds at
        # least three '&'-separated parts; fewer would raise IndexError.
        m = re.match(r'(Tri-Color)\s+(.+)', val, re.IGNORECASE)
        if m:
            parts = re.split(r'\s*&\s*', m.group(2))
            fmts = [wb.add_format({'font_color':contrast_color(fill,get_css_color(p)),'bg_color':fill})
                    for p in parts]
            segments = [fmts[0], f"{m.group(1)} {parts[0]} & ",
                        fmts[1], f"{parts[1]} & ",
                        fmts[2], parts[2]]
            ws.write_rich_string(r, cidx, *segments, base_fmt)
            continue

        # Rainbow: colour each character of the matched word, cycling ROYGBIV
        if 'rainbow' in nl:
            m = re.match(r'(.*?)(Rainbow\d*|Rainbow)(.*)', val, re.IGNORECASE)
            if m:
                prefix, core, suffix = m.groups()
                parts = []
                if prefix: parts.append(prefix)
                rainbow_cols = ['#FF0000','#FF7F00','#FFFF00','#00FF00','#0000FF','#4B0082','#8F00FF']
                for i, ch in enumerate(core):
                    fmt = wb.add_format({'font_color': contrast_color(fill,rainbow_cols[i % len(rainbow_cols)]),
                                         'bg_color': fill})
                    parts.extend([fmt, ch])
                if suffix: parts.append(suffix)
                ws.write_rich_string(r, cidx, *parts, base_fmt)
                continue

        # Default: plain write with the pastel fill
        ws.write(r, cidx, val, base_fmt)

    # Out of Stock highlighting: red fill on 'Yes' cells
    oidx = df.columns.get_loc('Out of Stock')
    red_fmt = writer.book.add_format({'bg_color':'#ffcccc','font_color':'#000000'})
    for r, v in enumerate(df['Out of Stock'], start=2):
        if str(v).strip().lower() == 'yes':
            ws.write(r, oidx, v, red_fmt)
  240.  
  241.  
def process_sheet_combined(writer, df: pd.DataFrame, sheet_name: str):
    """Write the combined multi-store DataFrame to *sheet_name*.

    Like process_sheet, but instead of a single page link at A1, each row's
    'Store Link' cell carries a hyperlink displayed as that row's
    'Store Name' value. Requires 'Store Link', 'Store Name' and
    'Out of Stock' columns; 'Colour' formatting is applied only if present.
    """
    import xlsxwriter
    # startrow=1 keeps the layout identical to the per-store sheets.
    df.to_excel(writer, sheet_name=sheet_name, startrow=1, index=False)
    wb, ws = writer.book, writer.sheets[sheet_name]
    nr, nc = df.shape

    # hyperlink for Store Link column, displayed as the store's name
    link_fmt = wb.add_format({'font_color':'blue','underline':True})
    sidx = df.columns.get_loc('Store Link')
    sname_idx = df.columns.get_loc('Store Name') # Get index for the new 'Store Name' column
    for r, url_val in enumerate(df['Store Link'], start=2):
        # r-2 converts the spreadsheet row back to the DataFrame position
        store_name_display = df.iloc[r-2, sname_idx] # Get the store name from the DataFrame using iloc
        ws.write_url(r, sidx, url_val, link_fmt, string=store_name_display)

    # autofilter over the header row (1) and all data rows (2 .. nr+1)
    ws.autofilter(1, 0, nr+1, nc-1)

    # currency formatting for the three price columns
    money_fmt = wb.add_format({'num_format':'$0.00'})
    for col in ['Price (AUD)', 'pricing per spool if you buy 10', 'with CBTJ20 20% discount']:
        idx = df.columns.get_loc(col)
        letter = xlsxwriter.utility.xl_col_to_name(idx)
        ws.set_column(f'{letter}:{letter}', None, money_fmt)

    # Colour column formatting (pastel fills, rich strings for multi-colour
    # names) — same scheme as process_sheet.
    if 'Colour' in df.columns:
        cidx = df.columns.get_loc('Colour')
        for r, val in enumerate(df['Colour'], start=2):
            fill = get_fill_color(val)
            base_fmt = wb.add_format({'bg_color': fill})
            nl = val.lower()

            # Dual-Color: two '&'-separated segments, each tinted to its colour
            m = re.match(r'(Dual-Color)\s+(.+)', val, re.IGNORECASE)
            if m:
                parts = re.split(r'\s*&\s*', m.group(2))
                fmt1 = wb.add_format({'font_color':contrast_color(fill,get_css_color(parts[0])),'bg_color':fill})
                fmt2 = wb.add_format({'font_color':contrast_color(fill,get_css_color(parts[1])),'bg_color':fill})
                ws.write_rich_string(r, cidx,
                                     fmt1, f"{m.group(1)} {parts[0]} & ",
                                     fmt2, parts[1],
                                     base_fmt)
                continue

            # Tri-Color: three segments. NOTE(review): assumes at least three
            # '&'-separated parts; fewer would raise IndexError.
            m = re.match(r'(Tri-Color)\s+(.+)', val, re.IGNORECASE)
            if m:
                parts = re.split(r'\s*&\s*', m.group(2))
                fmts = [wb.add_format({'font_color':contrast_color(fill,get_css_color(p)),'bg_color':fill})
                        for p in parts]
                segments = [fmts[0], f"{m.group(1)} {parts[0]} & ",
                            fmts[1], f"{parts[1]} & ",
                            fmts[2], parts[2]]
                ws.write_rich_string(r, cidx, *segments, base_fmt)
                continue

            # Rainbow: colour each character of the matched word, cycling ROYGBIV
            if 'rainbow' in nl:
                m = re.match(r'(.*?)(Rainbow\d*|Rainbow)(.*)', val, re.IGNORECASE)
                if m:
                    prefix, core, suffix = m.groups()
                    parts = []
                    if prefix: parts.append(prefix)
                    rainbow_cols = ['#FF0000','#FF7F00','#FFFF00','#00FF00','#0000FF','#4B0082','#8F00FF']
                    for i, ch in enumerate(core):
                        fmt = wb.add_format({'font_color': contrast_color(fill,rainbow_cols[i % len(rainbow_cols)]),
                                             'bg_color': fill})
                        parts.extend([fmt, ch])
                    if suffix: parts.append(suffix)
                    ws.write_rich_string(r, cidx, *parts, base_fmt)
                    continue

            # Default: plain write with the pastel fill
            ws.write(r, cidx, val, base_fmt)

    # Out of Stock highlighting: red fill on 'Yes' cells
    oidx = df.columns.get_loc('Out of Stock')
    red_fmt = writer.book.add_format({'bg_color':'#ffcccc','font_color':'#000000'})
    for r, v in enumerate(df['Out of Stock'], start=2):
        if str(v).strip().lower() == 'yes':
            ws.write(r, oidx, v, red_fmt)
  323.  
  324.  
def main():
    """CLI entry point.

    Two modes:
      * explicit file list (optionally -o OUTPUT): one workbook per HTML file;
      * -all: process every *.html in the current directory, writing
        combined.xlsx with an 'All Filaments' sheet followed by one sheet
        per source file.
    """
    p = argparse.ArgumentParser(description="Extract eBay variation data → pastel-coloured Excel")
    p.add_argument('sources', nargs='*', help="HTML file(s) to process")
    # Single-dash long option; argparse still exposes it as args.all.
    p.add_argument('-all', action='store_true', help="Process all .html files in folder")
    p.add_argument('-o','--output', help="Output .xlsx filename (single-file mode)")
    args = p.parse_args()

    if args.all:
        html_files = glob.glob("*.html")
        if not html_files:
            print("No .html files found.", file=sys.stderr)
            sys.exit(1)
        combined_writer = pd.ExcelWriter("combined.xlsx", engine='xlsxwriter')
        all_dataframes = []
        individual_sheet_details = [] # To store (df, html_file, sheet_name) for later writing
    else:
        html_files = args.sources
        if not html_files:
            p.error("Specify HTML file(s) or use -all")

    for html in html_files:
        if not os.path.isfile(html):
            print(f"Skipping missing file: {html}", file=sys.stderr)
            continue

        base = os.path.splitext(os.path.basename(html))[0]
        out_file = f"{base}.xlsx" if args.all else (args.output or f"{base}.xlsx")

        try:
            vm = fetch_variation_model(html)
            df = build_dataframe(vm)
            page_url = extract_page_url(html) # Extract URL here

            # write individual workbook if not in -all mode
            if not args.all:
                writer = pd.ExcelWriter(out_file, engine='xlsxwriter')
                process_sheet(writer, df, html, sheet_name="Variations")
                writer.close()
                print(f"Wrote {len(df)} rows to {out_file}")

            # Collect data for combined workbook
            if args.all:
                df_with_link = df.copy()
                df_with_link.insert(0, 'Store Link', page_url)
                df_with_link.insert(1, 'Store Name', base.replace('_', ' ').title()) # Use base filename as store name
                all_dataframes.append(df_with_link)
                # base[:31] — Excel caps sheet names at 31 characters
                individual_sheet_details.append((df, html, base[:31])) # Store for later

        except Exception as e:
            # Best-effort batch behaviour: report and move on to the next file.
            print(f"Error processing {html}: {e}", file=sys.stderr)

    if args.all:
        if all_dataframes:
            combined_df = pd.concat(all_dataframes, ignore_index=True)
            # Write the combined dataframe to the first sheet
            process_sheet_combined(combined_writer, combined_df, sheet_name="All Filaments")
            print(f"Wrote {len(combined_df)} rows to 'All Filaments' sheet in combined.xlsx")

        # Now write individual sheets after the combined sheet
        for df_ind, html_ind, sheet_name_ind in individual_sheet_details:
            try:
                process_sheet(combined_writer, df_ind, html_ind, sheet_name=sheet_name_ind)
            except Exception as e:
                print(f"Error processing {html_ind} for combined workbook (individual sheet): {e}", file=sys.stderr)

        combined_writer.close()
        print("Wrote combined workbook to combined.xlsx")
  392.  
  393.  
# Standard script-entry guard: run only when executed directly, not on import.
if __name__ == '__main__':
    main()
  396.  
Advertisement
Add Comment
Please, Sign In to add comment