<?php
/**
 * HTML Parser
 *
 * Orchestrates HTML normalization and pattern detection.
 * Reads cached HTML files, normalizes content, detects patterns,
 * and extracts metadata.
 *
 * @package STCWHeadlessAssistant
 * @since 2.0.0
 */

namespace STCW\Headless\Engine;

use STCW\Headless\Engine\Detector\PatternDetector;

// Stop immediately when the ABSPATH constant has not been defined.
if (!defined('ABSPATH')) {
    exit;
}

class Parser {

    /**
     * Parse an HTML file and extract patterns
     *
     * Reads the file, runs it through the Normalizer, feeds the normalized
     * markup to the PatternDetector and collects metadata from the original
     * (un-normalized) HTML.
     *
     * @param string $file_path Path to HTML file
     * @return array Parsed data. On failure: 'success' => false, 'message' and an
     *               empty 'patterns' list. On success: 'success' => true, 'message',
     *               'metadata', 'patterns', 'normalized_html',
     *               'normalization_stats' and 'detection_stats'.
     */
    public function parse_file($file_path) {
        if (!file_exists($file_path)) {
            return $this->failure_result('File not found');
        }

        // file_exists() is also true for directories; reading one would only
        // raise a warning, so report it as unreadable up front.
        if (is_dir($file_path)) {
            return $this->failure_result('Could not read file');
        }

        // Read file
        // phpcs:ignore WordPress.WP.AlternativeFunctions.file_get_contents_file_get_contents -- Reading local files in CLI context
        $html = file_get_contents($file_path);

        // Compare against '' explicitly: empty() would also reject a file
        // whose whole content is the string "0".
        if ($html === false || $html === '') {
            return $this->failure_result('Could not read file');
        }

        // 1. NORMALIZE HTML
        $normalizer = new Normalizer();
        $normalized_html = $normalizer->normalize($html);

        // 2. DETECT PATTERNS
        $detector = new PatternDetector();
        $detected_patterns = $detector->detect($normalized_html);

        // 3. EXTRACT METADATA (from original HTML)
        $metadata = $this->extract_metadata($html);

        return [
            'success' => true,
            'message' => sprintf('Found %d patterns', count($detected_patterns)),
            'metadata' => $metadata,
            'patterns' => $detected_patterns,
            'normalized_html' => $normalized_html,
            'normalization_stats' => $normalizer->get_stats(),
            'detection_stats' => $detector->get_stats(),
        ];
    }

    /**
     * Build the result array returned by parse_file() when parsing fails.
     *
     * @param string $message Failure reason
     * @return array Result with 'success' => false and an empty pattern list
     */
    private function failure_result($message) {
        return [
            'success' => false,
            'message' => $message,
            'patterns' => [],
        ];
    }

    /**
     * Extract metadata from HTML (title, description, source envelope, etc.)
     *
     * Enhanced to extract source identity envelope for deterministic imports.
     * Prioritizes STCW_META comment for authoritative WordPress data.
     * Falls back to HTML meta tags for backward compatibility.
     *
     * Note: 'source' => 'template' is only present when the STCW_META comment
     * provides it; all other keys are always present.
     *
     * @since 2.1.0
     * @since 2.2.0 Added STCW_META extraction
     * @param string $html HTML content
     * @return array Metadata including source envelope
     */
    private function extract_metadata($html) {
        $metadata = [
            'title' => '',
            'description' => '',
            'og_image' => '',
            'canonical' => '',
            'url' => '',
            'slug' => '',

            // Source envelope for identity and change detection
            'source' => [
                'type' => 'wordpress',
                'permalink' => '',
                'wp_post_id' => null,
                'wp_post_type' => '',
                'og_type' => '',
                'published' => '',
                'modified' => '',
                'cached' => '',
                'scw_version' => '',
            ],
        ];

        // PRIORITY 1: STCW_META comment (authoritative WordPress data)
        $metadata = $this->apply_stcw_meta($metadata, $html);

        // Extract title (attributes on the <title> tag are tolerated)
        if (preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $html, $matches)) {
            $metadata['title'] = trim($matches[1]);
        }

        // Extract meta description
        $description = $this->find_attribute_value($html, 'meta', 'name', 'description', 'content');
        if ($description !== null) {
            $metadata['description'] = $description;
        }

        // Extract OG image
        $og_image = $this->find_attribute_value($html, 'meta', 'property', 'og:image', 'content');
        if ($og_image !== null) {
            $metadata['og_image'] = $og_image;
        }

        // FALLBACK: Extract canonical URL if not from STCW_META
        if (empty($metadata['canonical'])) {
            $canonical = $this->find_attribute_value($html, 'link', 'rel', 'canonical', 'href');
            if ($canonical !== null) {
                $metadata = $this->with_permalink($metadata, $canonical);
            }
        }

        // FALLBACK: Extract OG URL if canonical still not found
        if (empty($metadata['canonical'])) {
            $og_url = $this->find_attribute_value($html, 'meta', 'property', 'og:url', 'content');
            if ($og_url !== null) {
                $metadata = $this->with_permalink($metadata, $og_url);
            }
        }

        // Extract OG type (for context, even if we have wp_post_type)
        $og_type = $this->find_attribute_value($html, 'meta', 'property', 'og:type', 'content');
        if ($og_type !== null) {
            $metadata['source']['og_type'] = $og_type;

            // Only use as wp_post_type if we don't have authoritative data from STCW_META
            if (empty($metadata['source']['wp_post_type'])) {
                $metadata['source']['wp_post_type'] = $og_type;
            }
        }

        // FALLBACK: Extract published time from meta tags if not from STCW_META
        if (empty($metadata['source']['published'])) {
            $published = $this->find_attribute_value($html, 'meta', 'property', 'article:published_time', 'content');
            if ($published !== null) {
                $metadata['source']['published'] = $published;
            }
        }

        // FALLBACK: Extract modified time from meta tags if not from STCW_META
        if (empty($metadata['source']['modified'])) {
            $modified = $this->find_attribute_value($html, 'meta', 'property', 'article:modified_time', 'content');
            if ($modified !== null) {
                $metadata['source']['modified'] = $modified;
            }
        }

        // FALLBACK: Extract StaticCacheWrangler timestamp if not from STCW_META
        // Format: <!-- StaticCacheWrangler: generated=2025-12-29T01:00:56+00:00; plugin=2.1.4 -->
        if (empty($metadata['source']['cached']) && preg_match('/<!--\s*StaticCacheWrangler:\s*generated=([\d\-T:+]+);\s*plugin=([\d\.]+)\s*-->/i', $html, $matches)) {
            $metadata['source']['cached'] = trim($matches[1]);
            $metadata['source']['scw_version'] = trim($matches[2]);
        }

        return $metadata;
    }

    /**
     * Merge the STCW_META comment payload into the metadata array.
     *
     * Format: <!-- STCW_META:{"wp_post_id":7,"wp_post_type":"page",...} -->
     * The metadata is returned unchanged when the comment is missing or its
     * payload does not decode to a non-empty JSON object/array.
     *
     * @param array  $metadata Metadata collected so far
     * @param string $html     HTML content
     * @return array Metadata with any STCW_META values applied
     */
    private function apply_stcw_meta($metadata, $html) {
        if (!preg_match('/<!--\s*STCW_META:(.*?)\s*-->/s', $html, $matches)) {
            return $metadata;
        }

        $stcw_data = json_decode(trim($matches[1]), true);
        if (!is_array($stcw_data) || $stcw_data === []) {
            return $metadata;
        }

        if (isset($stcw_data['wp_post_id'])) {
            $metadata['source']['wp_post_id'] = (int) $stcw_data['wp_post_id'];
        }

        // Only a string permalink can be parsed as a URL; any other type is
        // ignored so the canonical / og:url fallbacks still get a chance.
        if (isset($stcw_data['permalink']) && is_string($stcw_data['permalink'])) {
            $metadata = $this->with_permalink($metadata, $stcw_data['permalink']);
        }

        // STCW_META key => key inside $metadata['source'], copied verbatim.
        $field_map = [
            'wp_post_type' => 'wp_post_type',
            'post_date' => 'published',
            'post_modified' => 'modified',
            'template' => 'template',
            'scw_version' => 'scw_version',
            'cached_at' => 'cached',
        ];

        foreach ($field_map as $stcw_key => $source_key) {
            if (isset($stcw_data[$stcw_key])) {
                $metadata['source'][$source_key] = $stcw_data[$stcw_key];
            }
        }

        return $metadata;
    }

    /**
     * Record a URL as canonical/url/source permalink and derive the slug from it.
     *
     * The slug is left untouched when none can be derived from the URL.
     *
     * @param array  $metadata Metadata collected so far
     * @param string $url      Permalink / canonical URL
     * @return array Updated metadata
     */
    private function with_permalink($metadata, $url) {
        $metadata['canonical'] = $url;
        $metadata['url'] = $url;
        $metadata['source']['permalink'] = $url;

        $slug = $this->slug_from_url($url);
        if ($slug !== null) {
            $metadata['slug'] = $slug;
        }

        return $metadata;
    }

    /**
     * Derive a slug (last path segment) from a URL.
     *
     * A root path ("/" or empty) and a host-only URL both yield 'home', so
     * "https://example.com" and "https://example.com/" agree.
     *
     * @param string $url URL to inspect
     * @return string|null Slug, or null when the URL cannot be parsed or has
     *                     neither a path nor a host
     */
    private function slug_from_url($url) {
        $parsed = wp_parse_url($url);
        if (!is_array($parsed)) {
            return null;
        }

        if (!isset($parsed['path'])) {
            return isset($parsed['host']) ? 'home' : null;
        }

        $segments = explode('/', trim($parsed['path'], '/'));
        $last = end($segments);

        // Strict comparison so that a segment of "0" is kept as a slug.
        return $last !== '' ? $last : 'home';
    }

    /**
     * Find the first `<tag key_attr="key_value" value_attr="...">` and return the value.
     *
     * The key attribute must directly follow the tag name and the value attribute
     * must directly follow the key attribute. The value is terminated by the same
     * quote character that opened it, so a value such as content="It's fine" is
     * returned whole instead of being cut at the apostrophe.
     *
     * @param string $html       HTML content
     * @param string $tag        Tag name, e.g. 'meta' or 'link'
     * @param string $key_attr   Identifying attribute name, e.g. 'property'
     * @param string $key_value  Identifying attribute value, e.g. 'og:image'
     * @param string $value_attr Attribute whose value is wanted, e.g. 'content'
     * @return string|null Trimmed attribute value, or null when no tag matches
     */
    private function find_attribute_value($html, $tag, $key_attr, $key_value, $value_attr) {
        $pattern = sprintf(
            '/<%s\s+%s=["\']%s["\']\s+%s=(["\'])(.*?)\1/is',
            $tag,
            $key_attr,
            preg_quote($key_value, '/'),
            $value_attr
        );

        if (preg_match($pattern, $html, $matches)) {
            return trim($matches[2]);
        }

        return null;
    }

    /**
     * Extract main content area from HTML
     *
     * Looks for common WordPress content containers in priority order and
     * returns the serialized first match; falls back to the <body> element and
     * finally to the unmodified input. Not called by any other method visible
     * in this class.
     *
     * @param string $html HTML content
     * @return string Main content HTML
     */
    private function extract_content_area($html) {
        // Common WordPress selectors in priority order
        $selectors = [
            'article .entry-content',
            '.entry-content',
            'main .content',
            'main',
            '.post-content',
            '#content',
        ];

        // Use DOMDocument for reliable extraction. Parse errors are collected
        // internally instead of being emitted; the previous libxml setting is
        // restored afterwards so this global state does not leak to other code.
        $previous_setting = libxml_use_internal_errors(true);
        $dom = new \DOMDocument();
        // The XML declaration prefix makes the parser treat the markup as UTF-8.
        $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous_setting);

        $xpath = new \DOMXPath($dom);

        foreach ($selectors as $selector) {
            // Convert CSS selector to XPath (basic conversion)
            $nodes = $xpath->query($this->css_to_xpath($selector));

            // query() returns false for an expression it cannot evaluate.
            if ($nodes === false || $nodes->length === 0) {
                continue;
            }

            return $dom->saveHTML($nodes->item(0));
        }

        // Fallback: return body content
        $body_nodes = $xpath->query('//body');
        $body = $body_nodes === false ? null : $body_nodes->item(0);
        if ($body) {
            return $dom->saveHTML($body);
        }

        return $html;
    }

    /**
     * Convert CSS selector to XPath (basic implementation)
     *
     * Supports `element`, `.class`, `#id`, `element.class` and whitespace
     * separated descendant chains of those. Class selectors match whole class
     * tokens, so `.content` does not match `class="entry-content"`. Anything
     * else is passed through prefixed with `//`.
     *
     * @param string $css CSS selector
     * @return string XPath query
     */
    private function css_to_xpath($css) {
        $css = trim($css);

        // Handle nested selectors (descendant combinator). Splitting on runs
        // of whitespace avoids empty parts that would produce invalid XPath.
        if (preg_match('/\s/', $css)) {
            $parts = preg_split('/\s+/', $css);
            return implode('', array_map([$this, 'css_to_xpath'], $parts));
        }

        // Handle element.class pattern
        if (preg_match('/^([a-z][a-z0-9]*)\.([a-z0-9_-]+)$/i', $css, $matches)) {
            return "//{$matches[1]}[" . $this->class_token_predicate($matches[2]) . ']';
        }

        // Handle .class pattern
        if (preg_match('/^\.([a-z0-9_-]+)$/i', $css, $matches)) {
            return '//*[' . $this->class_token_predicate($matches[1]) . ']';
        }

        // Handle #id pattern
        if (preg_match('/^#([a-z0-9_-]+)$/i', $css, $matches)) {
            return "//*[@id='{$matches[1]}']";
        }

        // Handle element pattern, and pass anything unrecognized through as-is
        return "//{$css}";
    }

    /**
     * Build an XPath predicate that matches a whole class token.
     *
     * Padding the normalized class attribute with spaces lets contains() test
     * for " name " and thereby avoid substring hits.
     *
     * @param string $class_name Class name (already validated by the caller)
     * @return string XPath predicate body (without surrounding brackets)
     */
    private function class_token_predicate($class_name) {
        return "contains(concat(' ', normalize-space(@class), ' '), ' {$class_name} ')";
    }
}

