#!/usr/bin/env python3
"""
Pattern Analysis Script
Analyzes crawled data to identify common patterns and components
"""

import json
import re
from collections import Counter
from typing import Dict
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PatternAnalyzer:
    """Finds recurring patterns in crawled site data.

    Expects a mapping of domain -> {'pages': [{'extracted': {...}}, ...]}
    as produced by crawler.py.
    """

    def __init__(self, crawled_data: Dict):
        self.data = crawled_data
        self.patterns = {
            'common': {},
            'likely': {},
            'nice_to_have': {}
        }
        
    def analyze_navigation_patterns(self) -> Dict:
        """Analyze common navigation structures"""
        nav_items = Counter()
        
        for domain, site_data in self.data.items():
            for page in site_data.get('pages', []):
                nav_elements = page.get('extracted', {}).get('navigation', [])
                for nav_text in nav_elements:
                    # Clean and normalize nav text
                    items = re.split(r'[|\n\t]', nav_text)
                    for item in items:
                        clean_item = item.strip().lower()
                        if clean_item and len(clean_item) < 50:
                            nav_items[clean_item] += 1
                            
        return dict(nav_items.most_common(20))
    
    def analyze_hero_patterns(self) -> Dict:
        """Analyze hero section patterns"""
        hero_patterns = {
            'headlines': Counter(),
            'subheadings': Counter(),
            'cta_buttons': Counter()
        }
        
        for domain, site_data in self.data.items():
            for page in site_data.get('pages', []):
                extracted = page.get('extracted', {})
                headings = extracted.get('headings', [])
                ctas = extracted.get('ctas', [])

                # Treat the first heading as the likely hero headline and
                # the second, when present, as its subheading
                if headings:
                    hero_patterns['headlines'][headings[0].strip().lower()] += 1
                if len(headings) > 1:
                    hero_patterns['subheadings'][headings[1].strip().lower()] += 1

                # Count CTAs that look like conversion actions
                for cta in ctas:
                    clean_cta = cta.strip().lower()
                    if any(word in clean_cta for word in ['apply', 'start', 'pre-approve', 'get', 'contact']):
                        hero_patterns['cta_buttons'][clean_cta] += 1
        
        return {k: dict(v.most_common(10)) for k, v in hero_patterns.items()}
    
    def analyze_loan_products(self) -> Dict:
        """Analyze common loan product offerings"""
        loan_types = Counter()
        loan_keywords = [
            'conventional', 'fha', 'va', 'usda', 'jumbo', 'non-qm',
            'purchase', 'refinance', 'cash-out', 'heloc', 'construction',
            'investment', 'first-time', 'renovation'
        ]
        
        for domain, site_data in self.data.items():
            # Join copy and headings across the whole site so each keyword
            # is counted at most once per domain
            site_text_parts = []
            for page in site_data.get('pages', []):
                extracted = page.get('extracted', {})
                site_text_parts.extend(extracted.get('copy_blocks', []))
                site_text_parts.extend(extracted.get('headings', []))
            text_content = ' '.join(site_text_parts).lower()

            for keyword in loan_keywords:
                if keyword in text_content:
                    loan_types[keyword] += 1
        
        return dict(loan_types.most_common())
    
    def analyze_color_patterns(self) -> Dict:
        """Analyze common color schemes"""
        all_colors = Counter()
        
        for domain, site_data in self.data.items():
            # Deduplicate within a site so each color counts once per domain
            site_colors = set()
            for page in site_data.get('pages', []):
                styles = page.get('extracted', {}).get('styles', {})
                for color in styles.get('colors', []):
                    # Normalize and accept only 3- or 6-digit hex colors
                    color = color.lower().strip()
                    if re.match(r'^#(?:[0-9a-f]{3}|[0-9a-f]{6})$', color):
                        site_colors.add(color)
            all_colors.update(site_colors)
        
        return dict(all_colors.most_common(20))
    
    def analyze_copy_patterns(self) -> Dict:
        """Analyze common copy patterns and phrases"""
        common_phrases = Counter()
        value_props = Counter()
        
        # Common mortgage industry phrases
        mortgage_phrases = [
            'competitive rates', 'fast approval', 'experienced loan officer',
            'personalized service', 'local lender', 'equal housing',
            'nmls', 'licensed', 'pre-approved', 'closing costs',
            'down payment', 'credit score', 'debt-to-income'
        ]
        
        for domain, site_data in self.data.items():
            # Join copy across the whole site so each phrase is counted at
            # most once per domain
            site_text = ' '.join(
                block
                for page in site_data.get('pages', [])
                for block in page.get('extracted', {}).get('copy_blocks', [])
            ).lower()

            for phrase in mortgage_phrases:
                if phrase in site_text:
                    common_phrases[phrase] += 1

            # Extract sentences that might be value propositions, deduplicated
            # within a site so boilerplate repeated on every page counts once
            site_sentences = set()
            for sentence in re.split(r'[.!?]', site_text):
                sentence = sentence.strip()
                if (20 < len(sentence) < 100 and
                        any(word in sentence for word in ['we', 'our', 'best', 'fast', 'easy', 'competitive'])):
                    site_sentences.add(sentence)
            value_props.update(site_sentences)
        
        return {
            'phrases': dict(common_phrases.most_common(15)),
            'value_props': dict(value_props.most_common(10))
        }
    
    def analyze_compliance_patterns(self) -> Dict:
        """Analyze compliance and legal patterns"""
        compliance_items = Counter()
        
        compliance_keywords = [
            'nmls', 'equal housing', 'fair housing', 'privacy policy',
            'terms of use', 'disclosures', 'licensing', 'regulated by',
            'member fdic', 'complaints', 'accessibility'
        ]
        
        for domain, site_data in self.data.items():
            # Check each keyword once per domain
            site_text = ' '.join(
                block
                for page in site_data.get('pages', [])
                for block in page.get('extracted', {}).get('copy_blocks', [])
            ).lower()

            for keyword in compliance_keywords:
                if keyword in site_text:
                    compliance_items[keyword] += 1
        
        return dict(compliance_items.most_common())
    
    def categorize_patterns(self, pattern_data: Dict, min_sites: Dict) -> None:
        """Categorize patterns by how many sites they appear on"""
        for pattern_type, patterns in pattern_data.items():
            if not isinstance(patterns, dict):
                continue
            for pattern, count in patterns.items():
                if isinstance(count, int):
                    self._categorize_single_pattern(pattern_type, pattern, count, min_sites)
                elif isinstance(count, dict):
                    # Nested results (hero_patterns, copy_patterns): flatten
                    # one level using a compound type key
                    for sub_pattern, sub_count in count.items():
                        if isinstance(sub_count, int):
                            self._categorize_single_pattern(
                                f'{pattern_type}.{pattern}', sub_pattern, sub_count, min_sites)
    
    def _categorize_single_pattern(self, pattern_type: str, pattern: str, count: int, min_sites: Dict):
        """Helper to place a single pattern into a frequency bucket"""
        if count >= min_sites['common_min_sites']:
            bucket = 'common'
        elif count >= min_sites['likely_min_sites']:
            bucket = 'likely'
        elif count >= min_sites['nice_to_have_min_sites']:
            bucket = 'nice_to_have'
        else:
            return
        self.patterns[bucket].setdefault(pattern_type, {})[pattern] = count
    
    def generate_design_tokens(self) -> Dict:
        """Generate design tokens based on common patterns"""
        
        # Analyze common colors and derive palette
        color_analysis = self.analyze_color_patterns()
        
        # Default professional mortgage industry palette
        base_palette = {
            'primary': '#1a365d',      # Professional blue
            'secondary': '#2d3748',    # Dark gray
            'accent': '#3182ce',       # Bright blue
            'success': '#38a169',      # Green
            'neutral_50': '#f7fafc',
            'neutral_100': '#edf2f7',
            'neutral_200': '#e2e8f0',
            'neutral_300': '#cbd5e0',
            'neutral_400': '#a0aec0',
            'neutral_500': '#718096',
            'neutral_600': '#4a5568',
            'neutral_700': '#2d3748',
            'neutral_800': '#1a202c',
            'neutral_900': '#171923'
        }
        
        return {
            'colors': base_palette,
            'typography': {
                'font_stack': 'system-ui, -apple-system, Inter, "SF Pro Text", Roboto, Arial, sans-serif',
                'scale_px': [12, 14, 16, 18, 20, 24, 28, 34, 44, 56]
            },
            'spacing': [4, 8, 12, 16, 24, 32, 48, 64],
            'border_radius': [6, 12, 20],
            'shadows': {
                'sm': '0 2px 20px rgba(0,0,0,0.06)',
                'md': '0 8px 30px rgba(0,0,0,0.12)'
            }
        }
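
    # The tokens above are plain data; a consumer might render them as CSS
    # custom properties, e.g. (hypothetical names):
    #   :root { --color-primary: #1a365d; --radius-sm: 6px; }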
    
    def analyze_all(self) -> Dict:
        """Run all analyses and return complete pattern analysis"""
        logger.info("Starting pattern analysis...")
        
        analysis_results = {
            'navigation': self.analyze_navigation_patterns(),
            'hero_patterns': self.analyze_hero_patterns(),
            'loan_products': self.analyze_loan_products(),
            'colors': self.analyze_color_patterns(),
            'copy_patterns': self.analyze_copy_patterns(),
            'compliance': self.analyze_compliance_patterns()
        }
        
        # Bucket patterns by absolute site counts: 6+ sites is 'common',
        # 4+ 'likely', 2+ 'nice to have'
        min_sites = {
            'common_min_sites': 6,
            'likely_min_sites': 4,
            'nice_to_have_min_sites': 2
        }
        
        self.categorize_patterns(analysis_results, min_sites)
        
        # Generate design tokens
        design_tokens = self.generate_design_tokens()
        
        return {
            'raw_analysis': analysis_results,
            'categorized_patterns': self.patterns,
            'design_tokens': design_tokens,
            'summary': {
                'total_sites_analyzed': len(self.data),
                'common_patterns': sum(len(v) for v in self.patterns['common'].values()),
                'likely_patterns': sum(len(v) for v in self.patterns['likely'].values())
            }
        }
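
# Expected input shape (hypothetical single-page example; real data comes
# from crawler.py):
#
#   {
#     "example-lender.com": {
#       "pages": [{
#         "extracted": {
#           "navigation": ["Home | Loan Options | About | Contact"],
#           "headings": ["Find the right mortgage", "Why work with us"],
#           "ctas": ["Apply Now", "Get Pre-Approved"],
#           "copy_blocks": ["Competitive rates and fast approval..."],
#           "styles": {"colors": ["#1a365d", "#3182ce"]}
#         }
#       }]
#     }
#   }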

def main():
    # Load crawled data
    try:
        with open('/Volumes/Scrape Live/data/sample_crawled_data.json', 'r') as f:
            crawled_data = json.load(f)
    except FileNotFoundError:
        logger.error("Crawled data not found. Run crawler.py first.")
        return
    except json.JSONDecodeError as e:
        logger.error(f"Crawled data is not valid JSON: {e}")
        return
    
    analyzer = PatternAnalyzer(crawled_data)
    analysis_results = analyzer.analyze_all()
    
    # Save analysis results
    with open('/Volumes/Scrape Live/data/pattern_analysis.json', 'w') as f:
        json.dump(analysis_results, f, indent=2)
    
    logger.info("Pattern analysis complete.")
    logger.info(f"Analyzed {analysis_results['summary']['total_sites_analyzed']} sites")
    logger.info(f"Found {analysis_results['summary']['common_components']} common patterns")
    
    return analysis_results

if __name__ == "__main__":
    main()