#!/usr/bin/env python3
"""
Mortgage Sites Crawler
Crawls a configured list of mortgage websites and extracts content, style, and SEO data for analysis
"""

import requests
from bs4 import BeautifulSoup
import json
import time
import re
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
from typing import Dict
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MortgageSiteCrawler:
    def __init__(self, config):
        self.config = config
        self.session = requests.Session()
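        # Reuse one HTTP session so connections are pooled across requests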
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
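        # Crawl state shared across all sites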
        self.visited_urls = set()
        self.extracted_data = {}
        
    def can_fetch(self, url: str) -> bool:
        """Check robots.txt if respect_robots is enabled"""
        if not self.config['crawl']['respect_robots']:
            return True
            
        try:
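            # Query the site's robots.txt, built from the target URL's scheme and host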
            rp = RobotFileParser()
            rp.set_url(f"{urlparse(url).scheme}://{urlparse(url).netloc}/robots.txt")
            rp.read()
            return rp.can_fetch('*', url)
        except Exception:
            # If robots.txt cannot be fetched or parsed, fail open and allow the crawl
            return True
    
    def should_crawl_url(self, url: str, base_domain: str) -> bool:
        """Check whether a URL should be crawled, based on configured patterns"""
        parsed = urlparse(url)
        path = parsed.path.lower()

        # Stay on the same domain (loose substring match against the start URL's host)
        if parsed.netloc and base_domain not in parsed.netloc:
            return False

        # Skip binary and document assets
        if any(path.endswith(ext) for ext in self.config['crawl']['exclude_extensions']):
            return False

        # Check include patterns; a trailing '$' anchors the pattern to the end of the path
        for pattern in self.config['crawl']['include_patterns']:
            if pattern.endswith('$'):
                if (path or '/') == pattern[:-1]:
                    return True
            elif pattern in path:
                return True

        return False
    
    def extract_content(self, soup: BeautifulSoup, url: str) -> Dict:
        """Extract content using configured selectors"""
        data = {'url': url, 'extracted': {}}
        selectors = self.config['extraction']['selectors']
        
        # Extract by selectors
        for key, selector_list in selectors.items():
            elements = []
            for selector in selector_list:
                try:
                    found = soup.select(selector)
                    elements.extend(found)
                except Exception as e:
                    logger.debug(f"Selector error for {selector}: {e}")
            
            if key == 'images':
                data['extracted'][key] = [
                    {'src': img.get('src'), 'alt': img.get('alt', '')}
                    for img in elements if img.get('src')
                ]
            else:
                # Every other selector group (headings, copy_blocks, ctas, navigation, ...)
                # is reduced to its non-empty text content
                data['extracted'][key] = [
                    elem.get_text().strip() for elem in elements if elem.get_text().strip()
                ]
        
        # Extract colors and styles
        data['extracted']['styles'] = self.extract_styles(soup)
        
        # Extract SEO data
        title = soup.find('title')
        data['extracted']['title'] = title.get_text().strip() if title else ''
        
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        data['extracted']['meta_description'] = meta_desc.get('content', '') if meta_desc else ''
        
        return data
    
    def extract_styles(self, soup: BeautifulSoup) -> Dict:
        """Extract color and style information"""
        styles = {
            'colors': [],
            'fonts': [],
            'button_styles': []
        }
        
        # Collect CSS from <style> tags on the page (external stylesheets and style
        # attributes are not inspected, so 'button_styles' stays empty in this pass)
        style_tags = soup.find_all('style')
        for style in style_tags:
            css_text = style.get_text()
            
            # Extract hex colors (3- or 6-digit forms only)
            color_matches = re.findall(r'#(?:[0-9a-fA-F]{6}|[0-9a-fA-F]{3})\b', css_text)
            styles['colors'].extend(color_matches)
            
            # Extract font families
            font_matches = re.findall(r'font-family:\s*([^;]+)', css_text)
            styles['fonts'].extend(font_matches)
        
        return styles
    
    def crawl_site(self, start_url: str) -> Dict:
        """Crawl a single site"""
        domain = urlparse(start_url).netloc
        logger.info(f"Crawling {domain}")
        
        if not self.can_fetch(start_url):
            logger.warning(f"Robots.txt disallows crawling {start_url}")
            return {}
        
        to_visit = [start_url]
        site_data = {'domain': domain, 'pages': []}
        
        # Breadth-first crawl over a FIFO queue of pending URLs
        max_pages = 5  # Per-site page budget
        while to_visit and len(site_data['pages']) < max_pages:
            url = to_visit.pop(0)
            
            if url in self.visited_urls:
                continue
                
            try:
                response = self.session.get(
                    url, 
                    timeout=self.config['crawl']['timeout_sec'],
                    allow_redirects=True
                )
                response.raise_for_status()
                
                if 'text/html' not in response.headers.get('content-type', ''):
                    continue
                    
                self.visited_urls.add(url)
                
                soup = BeautifulSoup(response.content, 'html.parser')
                page_data = self.extract_content(soup, url)
                site_data['pages'].append(page_data)
                
                # Queue additional same-domain URLs until the page budget is reached
                if len(site_data['pages']) < max_pages:
                    links = soup.find_all('a', href=True)
                    for link in links:
                        new_url = urljoin(url, link['href'])
                        if (new_url not in self.visited_urls and 
                            new_url not in to_visit and 
                            self.should_crawl_url(new_url, domain)):
                            to_visit.append(new_url)
                
                time.sleep(0.5)  # Be respectful
                
            except Exception as e:
                logger.error(f"Error crawling {url}: {e}")
                continue
        
        return site_data
    
    def crawl_all_sites(self) -> Dict:
        """Crawl all configured sites"""
        all_data = {}
        
        for url in self.config['sources']['start_urls']:
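            # Sites are crawled sequentially; a failure on one site is logged and skipped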
            try:
                site_data = self.crawl_site(url)
                if site_data:
                    all_data[site_data['domain']] = site_data
            except Exception as e:
                logger.error(f"Failed to crawl {url}: {e}")
        
        return all_data

def main():
    # Inline configuration: target sites, crawl behavior, and extraction selectors
    config = {
        "sources": {
            "start_urls": [
                "https://www.emortgagecapital.com",
                "https://www.clearmortgagecapital.com",
                "https://www.cohenfinancialgroup.com",
                "https://www.loanfactory.com",
                "https://www.sunnyhillfinancial.com",
                "https://trussfinancialgroup.com",
                "https://convoyhomeloans.com",
                "https://www.c2financial.com",
                "https://www.fullcirclehomeloans.com",
                "https://www.kredium.com",
                "https://reliancefinancial.com",
                "https://loans.sierrapacificmortgage.com"
            ]
        },
        "crawl": {
            "max_depth": 2,
            "include_patterns": [
                "/$", "/home", "/about", "/team", "/leadership",
                "/loan", "/purchase", "/refinance", "/rates",
                "/resources", "/blog", "/news", "/faq", "/calculator",
                "/contact", "/apply", "/branches", "/locations",
                "/privacy", "/terms", "/licens", "/disclosure"
            ],
            "exclude_extensions": [".pdf",".zip",".doc",".xls",".mp4",".mov",".avi",".webm"],
            "respect_robots": True,
            "timeout_sec": 20
        },
        "extraction": {
            "selectors": {
                "navigation": ["header nav", "[role='navigation']"],
                "hero": [".hero", "[class*='hero']", "main section:has(h1)"],
                "headings": ["h1", "h2", "h3"],
                "copy_blocks": ["section p", "li"],
                "ctas": ["a[href*='apply']", "a[href*='contact']", "a[href*='pre-approve']", "a[href*='get-started']", "button", ".btn"],
                "forms": ["form", "iframe[src*='apply']", "a[href*='apply']"],
                "images": ["img[alt][src]"]
            }
        }
    }
    
    crawler = MortgageSiteCrawler(config)
    data = crawler.crawl_all_sites()
    
    # Save raw data
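    # NOTE: this directory must already exist; adjust the path for your environment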
    with open('/Volumes/Scrape Live/data/crawled_data.json', 'w') as f:
        json.dump(data, f, indent=2)
    
    logger.info(f"Crawling complete. Extracted data from {len(data)} sites.")
    
    return data

if __name__ == "__main__":
    main()