#!/usr/bin/env python3
import json, re, urllib.request, ssl, concurrent.futures

# TLS context passed to urlopen() by fetch().
# NOTE(review): certificate and hostname verification are deliberately
# switched off here, so HTTPS responses are not authenticated. That lets
# sites with broken certificates be fetched, but the scraped content should
# be treated as untrusted.
ctx = ssl.create_default_context()
# check_hostname has to be cleared first: the ssl module refuses
# verify_mode = CERT_NONE while hostname checking is still enabled.
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def fetch(url):
    """Download ``url`` and return the response body as text.

    The body is decoded with the charset declared in the response's
    Content-Type header, falling back to UTF-8 when none is declared or the
    declared name is not a usable text codec. Undecodable bytes are dropped.

    This function never raises for request failures: any exception from
    building the request, opening the URL or reading the body is reported
    as the string ``"__ERROR__" + str(exception)``, which ``parse``
    recognises by its prefix.
    """
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 SEO-Audit'})
        with urllib.request.urlopen(req, timeout=25, context=ctx) as r:
            raw = r.read()
            # None when the server sent no charset parameter.
            charset = r.headers.get_content_charset() or 'utf-8'
    except Exception as e:
        # Best-effort scraper: report the failure in-band instead of
        # aborting the whole run.
        return f"__ERROR__{e}"
    try:
        return raw.decode(charset, 'ignore')
    except (LookupError, ValueError):
        # Unknown or non-text codec name in the header: use the old default.
        return raw.decode('utf-8', 'ignore')

# One attribute inside a tag: a name, optionally followed by "=" and a
# double-quoted, single-quoted or unquoted value.
_ATTR_RE = re.compile(
    r'''([^\s"'<>/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+)))?''')


def _tag_attrs(tag_name, markup):
    """Yield a {lowercased attribute name: raw value} dict per <tag_name> tag."""
    # Quoted values are consumed as a unit so that a ">" or the "other" quote
    # character inside a value does not end the tag or the value early.
    tag_re = re.compile(
        rf'''<{tag_name}\b((?:[^>"']|"[^"]*"|'[^']*')*)>''', re.I)
    for tag in tag_re.finditer(markup):
        attrs = {}
        for name, dq, sq, bare in _ATTR_RE.findall(tag.group(1)):
            # First occurrence of a duplicated attribute wins.
            attrs.setdefault(name.lower(), dq or sq or bare)
        yield attrs


def _first_value(tags, key_attr, key, value_attr='content'):
    """Return the stripped value_attr of the first tag whose key_attr lists key.

    Tags whose value_attr is missing or blank are skipped; '' if none match.
    """
    for attrs in tags:
        value = attrs.get(value_attr, '').strip()
        if value and key in attrs.get(key_attr, '').lower().split():
            return value
    return ''


def parse(url, html):
    """Extract SEO-relevant facts from one fetched page.

    Args:
        url: The page URL; copied into the result unchanged.
        html: The page markup, or an ``"__ERROR__..."`` string as produced
            by ``fetch`` when the download failed.

    Returns:
        ``{'url', 'error'}`` for a failed download. Otherwise a dict with:
        ``url``, ``title``, ``title_len``, ``desc``, ``desc_len``,
        ``canonical``, ``og_title``, ``og_desc``, ``og_image``, ``robots``
        (missing values are ``''``), ``h1_count``, ``h1s`` (non-empty <h1>
        texts with inner tags removed), ``schema``, ``fb_pixel``,
        ``analytics`` (``'YES'``/``'NO'`` substring checks) and
        ``img_total``, ``img_missing_alt``.

    Meta/link lookups accept the attributes in any order and with either
    quote style. Entities are not unescaped, so lengths count raw markup.
    """
    if html.startswith('__ERROR__'):
        return {'url': url, 'error': html[len('__ERROR__'):]}

    # Metadata is looked up only in the part before </head>; with no closing
    # head tag the whole document is searched.
    head = re.split(r'</head\s*>', html, maxsplit=1, flags=re.I)[0]

    title_match = re.search(r'<title[^>]*>(.*?)</title>', head, re.I | re.S)
    title = title_match.group(1).strip() if title_match else ''

    metas = list(_tag_attrs('meta', head))
    desc = _first_value(metas, 'name', 'description')
    robots = _first_value(metas, 'name', 'robots')
    ogt = _first_value(metas, 'property', 'og:title')
    ogd = _first_value(metas, 'property', 'og:description')
    ogi = _first_value(metas, 'property', 'og:image')
    canon = _first_value(_tag_attrs('link', head), 'rel', 'canonical', 'href')

    h1s = re.findall(r'<h1[^>]*>(.*?)</h1>', html, re.I | re.S)
    h1s = [re.sub(r'<[^>]+>', '', h).strip() for h in h1s]
    h1s = [h for h in h1s if h]

    schema = 'YES' if 'application/ld+json' in html else 'NO'
    fbpix = 'YES' if ('connect.facebook.net' in html or 'fbq(' in html) else 'NO'
    ga = 'YES' if ('gtag(' in html or 'googletagmanager' in html
                   or 'google-analytics' in html) else 'NO'

    # An image counts as missing alt text when it has no alt attribute or an
    # empty one; look-alikes such as data-alt are not mistaken for alt.
    imgs = list(_tag_attrs('img', html))
    img_no_alt = sum(1 for attrs in imgs if not attrs.get('alt'))

    return {'url': url, 'title': title, 'title_len': len(title),
            'desc': desc, 'desc_len': len(desc),
            'canonical': canon, 'og_title': ogt, 'og_desc': ogd,
            'og_image': ogi, 'robots': robots,
            'h1_count': len(h1s), 'h1s': h1s, 'schema': schema,
            'fb_pixel': fbpix, 'analytics': ga,
            'img_total': len(imgs), 'img_missing_alt': img_no_alt}

# Build URL list: all pages + all products.
# Each input file is read as a JSON array of objects carrying a 'link' and a
# title['rendered'] field. `labels` maps each URL to its (type, display name).
urls=[]
labels={}
sources = (('PAGE', ['pages.json']),
           ('PRODUCT', ['products1.json','products2.json']))
for kind, files in sources:
    for f in files:
        # Explicit UTF-8 and a context manager: the handle is closed
        # promptly and decoding does not depend on the platform locale.
        with open(f, encoding='utf-8') as fh:
            entries = json.load(fh)
        for p in entries:
            urls.append(p['link'])
            labels[p['link']] = (kind, p['title']['rendered'])

print(f"Scraping {len(urls)} URLs...")
results=[]
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as ex:
    futs={ex.submit(fetch,u):u for u in urls}
    # Results are collected in completion order, so the order of `results`
    # (and of the output file) can differ between runs.
    for fu in concurrent.futures.as_completed(futs):
        u=futs[fu]
        r=parse(u, fu.result())
        r['type'],r['name']=labels[u]
        results.append(r)

# Write through a context manager so the file is flushed and closed before
# the summary is printed.
with open('seo_scrape.json','w', encoding='utf-8') as out:
    json.dump(results, out, indent=1)

# Summary (computed only over pages that were fetched successfully)
ok=[r for r in results if 'error' not in r]
print(f"OK: {len(ok)}  Errors: {len(results)-len(ok)}")
print("No meta description:", sum(1 for r in ok if not r['desc']))
print("Multiple H1s:", sum(1 for r in ok if r['h1_count']>1))
print("Zero H1:", sum(1 for r in ok if r['h1_count']==0))
print("No schema:", sum(1 for r in ok if r['schema']=='NO'))
print("No OG title:", sum(1 for r in ok if not r['og_title']))
print("FB Pixel present:", sum(1 for r in ok if r['fb_pixel']=='YES'))
print("Analytics present:", sum(1 for r in ok if r['analytics']=='YES'))
print("Title >60 chars:", sum(1 for r in ok if r['title_len']>60))
print("Desc >160 chars:", sum(1 for r in ok if r['desc_len']>160))