#!/usr/bin/env python3
"""Re-scrape CURRENT live state of all URLs (pages+products+categories) for before/after comparison."""
import json, re, urllib.request, ssl, html, concurrent.futures, time
# Shared TLS context used by fetch(). Hostname checking and certificate
# verification are both switched off, so HTTPS requests succeed even when the
# server certificate is invalid.
# NOTE(review): presumably intentional for scraping a known site -- confirm
# before pointing this at anything untrusted.
ctx = ssl.create_default_context()
# check_hostname has to be cleared first: ssl refuses CERT_NONE while it is on.
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
def c(s):
    """Return *s* as plain text.

    Anything that looks like a tag (``<...>``) is removed first, then HTML
    entities are decoded and surrounding whitespace is trimmed. A falsy *s*
    (``None`` or ``''``) yields ``''``.
    """
    raw = s if s else ''
    without_tags = re.sub('<[^>]+>', '', raw)
    decoded = html.unescape(without_tags)
    return decoded.strip()
def fetch(u):
    """Download *u* and return the response body as text.

    A ``cb=<unix time>`` query parameter and a ``Cache-Control: no-cache``
    header are added to every request (intended to get past caches). The body
    is decoded as UTF-8 with undecodable bytes dropped.

    Ordinary failures never raise: any ``Exception`` is reported in-band as
    the string ``'__ERR__<message>'``.
    """
    try:
        sep = '&' if '?' in u else '?'
        req = urllib.request.Request(
            f'{u}{sep}cb={int(time.time())}',
            headers={'User-Agent': 'Mozilla/5.0', 'Cache-Control': 'no-cache'},
        )
        # Close the response deterministically instead of leaving the socket
        # to the garbage collector while other worker threads keep fetching.
        with urllib.request.urlopen(req, timeout=30, context=ctx) as resp:
            return resp.read().decode('utf-8', 'ignore')
    except Exception as e:
        return f'__ERR__{e}'
def _meta_description(head):
    """Return the raw ``content`` of the first ``<meta name="description">``.

    Attributes are parsed one by one, so ``name`` and ``content`` may appear
    in either order and a value may contain the other kind of quote
    (``content="It's ..."``). Returns ``''`` when no description tag with a
    ``content`` attribute is found.
    """
    # Quoted values are skipped as a unit so a '>' inside one does not end the
    # tag early.
    for tag in re.finditer(r'<meta\b((?:[^>"\']|"[^"]*"|\'[^\']*\')*)>', head, re.I):
        attrs = {}
        for m in re.finditer(
                r'([^\s"\'<>/=]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))',
                tag.group(1)):
            # Exactly one of groups 2-4 matched; lastindex identifies it.
            # setdefault keeps the first occurrence of a repeated attribute.
            attrs.setdefault(m.group(1).lower(), m.group(m.lastindex))
        if attrs.get('name', '').strip().lower() == 'description' and 'content' in attrs:
            return attrs['content']
    return ''


def parse(u,h,label):
    """Summarise the fetched document *h* for URL *u*.

    Args:
        u: The URL the document was fetched from.
        h: The document text, or an ``'__ERR__<message>'`` string.
        label: A pair whose first item is stored as ``'type'`` and whose
            second item is stored as ``'name'``.

    Returns:
        For an ``__ERR__`` input: a dict with ``url``, ``name``, ``type`` and
        ``error`` (the message after the prefix). Otherwise a dict with
        ``url``, ``name``, ``type``, ``title``, ``title_len``, ``desc``,
        ``desc_len`` and ``h1_count`` (number of ``<h1>`` elements whose
        cleaned text is non-empty).
    """
    kind, name = label[0], label[1]
    if h.startswith('__ERR__'):
        return {'url': u, 'name': name, 'type': kind, 'error': h[7:]}

    # Title and description are looked up in the part before the closing head
    # tag (matched case-insensitively); without one the whole document is used.
    head = re.split(r'</head\s*>', h, maxsplit=1, flags=re.I)[0]

    title_match = re.search(r'<title[^>]*>(.*?)</title>', head, re.I | re.S)
    title = c(title_match.group(1)) if title_match else ''
    desc = c(_meta_description(head))

    # Headings are counted over the full document, ignoring empty ones.
    h1_count = sum(
        1 for inner in re.findall(r'<h1[^>]*>(.*?)</h1>', h, re.I | re.S) if c(inner)
    )
    return {'url': u, 'name': name, 'type': kind, 'title': title, 'title_len': len(title),
            'desc': desc, 'desc_len': len(desc), 'h1_count': h1_count}

def _load_json(path):
    """Parse the JSON file at *path*, closing it as soon as it has been read."""
    # Binary mode lets the json module detect UTF-8/16/32 (and a BOM) itself
    # rather than decoding with the platform's locale encoding.
    with open(path, 'rb') as fh:
        return json.load(fh)


# Map every URL to a (type, display name) label. A link that appears more than
# once keeps the label from its last occurrence.
urls={}
for p in _load_json('pages.json'):
    urls[p['link']] = ('Page', c(p['title']['rendered']))
for f in ['products1.json', 'products2.json']:
    for p in _load_json(f):
        urls[p['link']] = ('Product', c(p['title']['rendered']))
for ct in _load_json('categories.json'):
    # The 'uncategorized' category is left out of the scrape.
    if ct['slug'] != 'uncategorized':
        urls[ct['link']] = ('Category', html.unescape(ct['name']))

print('Re-scraping',len(urls),'URLs (current live state)...')
# Fetch with 8 worker threads. map() yields bodies in the same order as `urls`,
# so current_scrape.json is ordered identically on every run, which keeps
# before/after comparisons stable. fetch() reports failures in-band, so
# consuming the results does not raise for ordinary network errors.
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as ex:
    res = [parse(u, body, urls[u]) for u, body in zip(urls, ex.map(fetch, urls))]

# Close (and therefore flush) the output file explicitly.
with open('current_scrape.json', 'w', encoding='utf-8') as out:
    json.dump(res, out, indent=1)

# Summary: entries without an 'error' key count as OK.
ok = [r for r in res if 'error' not in r]
print('OK:',len(ok),'Errors:',len(res)-len(ok))
print('Titles <=60 now:',sum(1 for r in ok if 0<r['title_len']<=60),'/',len(ok))
print('Desc 70-160 now:',sum(1 for r in ok if 70<=r['desc_len']<=160),'/',len(ok))