"""Structural audit for compiled project pages.

Checks a compiled page against the project ledger: content hash, balance of
structural HTML tags, and that internal links point at files in the task queue.
"""
import hashlib
import os
import re
from html.parser import HTMLParser

from .security import get_safe_path, PROJECTS_DIR
from .state import load_ledger, update_failure_count

# Quoted href attribute values anywhere in the document (tag-agnostic).
_HREF_RE = re.compile(r'href=["\']([^"\']+)["\']', re.IGNORECASE)

# Splits a link at the first query or fragment delimiter.
_QUERY_OR_FRAGMENT_RE = re.compile(r"[?#]")

# Links with these prefixes point outside the project (or at an in-page
# anchor) and are not validated against the task queue. The scheme prefixes
# include their delimiter so a local file such as "http-errors.html" is still
# checked.
_UNCHECKED_LINK_PREFIXES = ("http://", "https://", "//", "mailto:", "#")

# Strike count at which a failing page is escalated instead of retried.
_MAX_STRIKES = 3


class TagBalanceParser(HTMLParser):
    """Track open/close balance for a fixed set of structural HTML tags.

    After input has been fed, ``errors`` holds one message per orphaned or
    mismatched closing tag, and ``stack`` holds a ``(tag, position)`` tuple
    for every tracked tag that is still open.
    """

    def __init__(self):
        super().__init__()
        self.stack = []
        self.errors = []
        # Exclude 'p' and 'font' to avoid false positives on standard HTML5
        # tag-omissions.
        self.tracked_tags = {
            'table', 'tr', 'td', 'div', 'nav',
            'header', 'footer', 'main', 'section',
        }

    def handle_starttag(self, tag, attrs):
        """Push a tracked opening tag together with its source position."""
        if tag in self.tracked_tags:
            self.stack.append((tag, self.getpos()))

    def handle_endtag(self, tag):
        """Match a tracked closing tag against the most recent open tag."""
        if tag not in self.tracked_tags:
            return

        line = self.getpos()[0]
        if not self.stack:
            self.errors.append(f"Orphaned </{tag}> near line {line}")
            return

        # The open tag is consumed even on mismatch, so one bad close does not
        # leave it on the stack to be reported a second time as unclosed.
        last_open, pos = self.stack.pop()
        if last_open != tag:
            self.errors.append(
                f"Mismatched tag: <{last_open}> (L{pos[0]}) "
                f"closed by </{tag}> (L{line})"
            )


def _audit_tag_balance(content):
    """Return tag-balance error messages for the HTML text *content*."""
    parser = TagBalanceParser()
    try:
        parser.feed(content)
        # Flush buffered trailing input so the end of the document is parsed.
        parser.close()
    except Exception as e:
        # A parser crash is reported as an audit finding rather than raised,
        # so the caller still records a strike for the page.
        return [f"Parser failure: {str(e)}"]

    errors = list(parser.errors)
    errors.extend(
        f"Unclosed <{unclosed}> near line {pos[0]}"
        for unclosed, pos in parser.stack
    )
    return errors


def _audit_links(content, allowed_files):
    """Return an error message for each internal href not in *allowed_files*.

    External links, protocol-relative links and pure in-page anchors are
    skipped. A query string or fragment is stripped before the lookup, so
    ``about.html#team`` is validated as ``about.html``.
    """
    errors = []
    for link in _HREF_RE.findall(content):
        if link.lower().startswith(_UNCHECKED_LINK_PREFIXES):
            continue

        target = _QUERY_OR_FRAGMENT_RE.split(link, maxsplit=1)[0]
        # An empty target (e.g. "?page=2") refers to the current document.
        if not target or target in allowed_files:
            continue

        errors.append(
            f"Dead Link Found: '{link}' is not in the project task queue."
        )
    return errors


def run_audit(project_name, page_id):
    """Audit the compiled file for *page_id* in *project_name*.

    Steps, in order: locate the task in the ledger's ``task_queue``, verify
    the file's SHA-256 against the ledger's ``hashes`` entry, check tag
    balance, and check internal links against the task queue filenames.

    Returns:
        A dict whose ``"status"`` is one of:

        * ``"success"`` - no findings; includes ``"message"``.
        * ``"error"`` - either a precondition failure (includes ``"reason"``)
          or audit findings (includes ``"strikes"`` and ``"errors"``).
        * ``"circuit_breaker"`` - findings and the strike count reached the
          limit; includes ``"reason"`` and ``"errors"``.
    """
    ledger = load_ledger(project_name)
    task_queue = ledger.get("task_queue", [])

    task = next((t for t in task_queue if t["page_id"] == page_id), None)
    if not task:
        return {"status": "error", "reason": "Task not found."}

    dest_file = get_safe_path(PROJECTS_DIR, project_name, "dist", task["filename"])
    if not os.path.exists(dest_file):
        return {"status": "error", "reason": "Compiled file not found on disk."}

    # 1. Hash Freshness Check
    # The hash is taken over the decoded text re-encoded as UTF-8, not the raw
    # bytes on disk. NOTE(review): presumably the ledger hash is produced the
    # same way - confirm against the writer.
    with open(dest_file, "r", encoding="utf-8") as f:
        disk_content = f.read()

    disk_hash = hashlib.sha256(disk_content.encode('utf-8')).hexdigest()
    ledger_hash = ledger.get("hashes", {}).get(page_id, "")
    if disk_hash != ledger_hash:
        return {
            "status": "error",
            "reason": "Security Fault: Disk hash does not match Ledger hash. Stale or tampered file.",
        }

    # 2. Tag Balance Audit
    errors = _audit_tag_balance(disk_content)

    # 3. Link Matrix Audit
    allowed_files = {t["filename"] for t in task_queue}
    errors.extend(_audit_links(disk_content, allowed_files))

    # 4. Three-Strike Circuit Breaker
    if errors:
        strikes = update_failure_count(project_name, page_id, increment=True)
        if strikes >= _MAX_STRIKES:
            return {
                "status": "circuit_breaker",
                "reason": (
                    f"Page '{page_id}' failed structural audit "
                    f"{_MAX_STRIKES} consecutive times. Escalating to human."
                ),
                "errors": errors,
            }
        return {"status": "error", "strikes": strikes, "errors": errors}

    # Reset strikes on success (presumably what increment=False does - the
    # helper is defined in .state).
    update_failure_count(project_name, page_id, increment=False)
    return {"status": "success", "message": "Audit passed. File is structurally sound."}