<?php
/**
 * HTML Normalizer
 *
 * Cleans and prepares HTML for pattern detection by removing WordPress-specific
 * attributes, inline styles, style/script tags, and unnecessary wrappers while 
 * preserving semantic structure.
 *
 * @package STCWHeadlessAssistant
 * @since 2.0.0
 */

namespace STCW\Headless\Engine;

// Stop immediately when the ABSPATH constant is missing, i.e. the file is being
// requested directly (assumed: ABSPATH is defined by the WordPress bootstrap).
if (!defined('ABSPATH')) {
    exit;
}

/**
 * Regex-based HTML normalizer.
 *
 * Runs a fixed pipeline over an HTML string (style/script removal, leaked PHP
 * message removal, schema.org attribute removal, inline style removal, admin
 * class removal, empty tag removal) and records how much each step removed.
 */
class Normalizer {

    /**
     * Regex fragment matching one quoted attribute value.
     *
     * Double- and single-quoted values are matched separately so that a value
     * containing the other quote character (style="font-family: 'Arial'") is
     * consumed as a whole instead of being cut at the first inner quote.
     *
     * @var string
     */
    private static $quoted_value = '(?:"[^"]*"|\'[^\']*\')';

    /**
     * Statistics of the most recent normalize() run
     * @var array
     */
    private $stats = [];

    /**
     * Normalize HTML for pattern detection
     *
     * Uses a regex-based approach to preserve the markup while cleaning it.
     * Statistics of the run are available through get_stats() afterwards.
     *
     * @param string $html Raw HTML content (non-string input is coerced; values
     *                     that cannot be converted are treated as empty)
     * @return string Normalized HTML
     */
    public function normalize($html) {
        // Coerce once so strlen()/preg_*() never receive null, arrays or objects.
        if (!is_string($html)) {
            $convertible = is_scalar($html)
                || (is_object($html) && method_exists($html, '__toString'));
            $html = $convertible ? (string) $html : '';
        }

        $this->stats = $this->default_stats(strlen($html));

        $normalized = $html;

        // Step 0: Remove <style>, <script> and <noscript> elements first so
        // their contents cannot be mistaken for markup by the later steps.
        $normalized = $this->remove_style_and_script_tags($normalized);

        // Step 0.5: Remove PHP warnings/errors that leaked into HTML
        $normalized = $this->remove_php_warnings($normalized);

        // Step 1: Remove schema.org attributes
        $normalized = $this->remove_schema_attributes($normalized);

        // Step 2: Remove inline styles
        $normalized = $this->remove_inline_styles_regex($normalized);

        // Step 3: Clean classes (minimal - only remove specific admin classes)
        $normalized = $this->clean_classes_regex($normalized);

        // Step 4: Remove empty tags
        $normalized = $this->remove_empty_tags($normalized);

        $this->stats['normalized_size'] = strlen($normalized);

        return $normalized;
    }

    /**
     * Build a zeroed statistics array
     *
     * 'wrappers_flattened' is kept for compatibility with consumers of
     * get_stats(); no step in this class currently updates it.
     *
     * @param int $original_size Size of the input in bytes
     * @return array Statistics with all counters at zero
     */
    private function default_stats($original_size = 0) {
        return [
            'attributes_removed' => 0,
            'inline_styles_removed' => 0,
            'style_tags_removed' => 0,
            'script_tags_removed' => 0,
            'php_warnings_removed' => 0,
            'empty_nodes_removed' => 0,
            'wrappers_flattened' => 0,
            'original_size' => $original_size,
            'normalized_size' => 0,
        ];
    }

    /**
     * Run preg_replace() and add the number of replacements to a statistic
     *
     * preg_replace() returns null when PCRE fails (for example when the
     * backtrack limit is hit on a very large document). In that case the
     * input is returned untouched instead of turning the document into null.
     *
     * @param string      $pattern     PCRE pattern
     * @param string      $replacement Replacement string
     * @param string      $html        HTML content
     * @param string|null $stat_key    Key in $this->stats to increment, or null
     * @return string HTML after replacement (or the unchanged input on failure)
     */
    private function replace_and_count($pattern, $replacement, $html, $stat_key = null) {
        $count = 0;
        $result = preg_replace($pattern, $replacement, $html, -1, $count);

        if (null === $result) {
            return $html;
        }

        if (null !== $stat_key) {
            $this->stats[$stat_key] += $count;
        }

        return $result;
    }

    /**
     * Remove <style>, <script> and <noscript> elements and their contents
     *
     * The statistics count elements that were actually removed, so an opening
     * tag without a matching closing tag is neither removed nor counted.
     *
     * @param string $html HTML content
     * @return string Cleaned HTML
     */
    private function remove_style_and_script_tags($html) {
        // Case-insensitive, dot matches newlines, lazy up to the first closing
        // tag. \b keeps the tag name from matching longer names.
        $html = $this->replace_and_count(
            '/<style\b[^>]*>.*?<\/style\s*>/is',
            '',
            $html,
            'style_tags_removed'
        );

        $html = $this->replace_and_count(
            '/<script\b[^>]*>.*?<\/script\s*>/is',
            '',
            $html,
            'script_tags_removed'
        );

        // <noscript> is removed as well but has no statistic of its own.
        $html = $this->replace_and_count(
            '/<noscript\b[^>]*>.*?<\/noscript\s*>/is',
            '',
            $html
        );

        return $html;
    }

    /**
     * Remove PHP warnings, errors, and notices that leaked into HTML
     *
     * Only the plain-text form "<Level>: ... on line <number>" is matched.
     * NOTE(review): messages wrapped in markup (e.g. "<b>Warning</b>:") are
     * not matched by these patterns - confirm whether that form must be
     * handled too.
     *
     * @param string $html HTML content
     * @return string Cleaned HTML
     */
    private function remove_php_warnings($html) {
        // Patterns are applied in this order; each is lazy up to the first
        // following "on line <digits>".
        $patterns = [
            // Deprecated: Function ... in /path/to/file.php on line 123
            '/Deprecated:\s+Function.*?on line \d+/is',
            '/Deprecated:\s+.*?on line \d+/is',

            // Notice: ... in /path/to/file.php on line 123
            '/Notice:\s+.*?on line \d+/is',

            // Warning: ... in /path/to/file.php on line 123
            '/Warning:\s+.*?on line \d+/is',

            // Fatal error: ... in /path/to/file.php on line 123
            '/Fatal error:\s+.*?on line \d+/is',

            // Parse error: ... in /path/to/file.php on line 123
            '/Parse error:\s+.*?on line \d+/is',

            // Strict Standards: ... in /path/to/file.php on line 123
            '/Strict Standards:\s+.*?on line \d+/is',
        ];

        foreach ($patterns as $pattern) {
            $html = $this->replace_and_count($pattern, '', $html, 'php_warnings_removed');
        }

        return $html;
    }

    /**
     * Remove schema.org attributes (itemtype, itemprop, itemscope) using regex
     *
     * Each removed attribute is replaced by a single space and counted once
     * in 'attributes_removed'.
     *
     * @param string $html HTML content
     * @return string Cleaned HTML
     */
    private function remove_schema_attributes($html) {
        $patterns = [
            '/\s+itemtype=' . self::$quoted_value . '/i',
            '/\s+itemprop=' . self::$quoted_value . '/i',
            // itemscope is a boolean attribute, so the value is optional. One
            // following whitespace character is consumed, but a following ">"
            // or "/" is only looked at, never removed, so the tag stays closed.
            '/\s+itemscope(?:=' . self::$quoted_value . ')?(?:\s|(?=[>\/]))/i',
        ];

        foreach ($patterns as $pattern) {
            $html = $this->replace_and_count($pattern, ' ', $html, 'attributes_removed');
        }

        return $html;
    }

    /**
     * Remove inline style attributes using regex
     *
     * @param string $html HTML content
     * @return string Cleaned HTML
     */
    private function remove_inline_styles_regex($html) {
        return $this->replace_and_count(
            '/\s+style=' . self::$quoted_value . '/i',
            '',
            $html,
            'inline_styles_removed'
        );
    }

    /**
     * Clean class attributes - only remove specific admin classes
     *
     * Works on whole class tokens inside quoted class attributes only, so
     * text content and longer class names that merely start with one of the
     * listed names are left alone. Attributes in which nothing is removed are
     * returned byte-for-byte; class attributes that end up empty are dropped.
     *
     * @param string $html HTML content
     * @return string Cleaned HTML
     */
    private function clean_classes_regex($html) {
        // Remove ONLY these specific admin classes (compared case-insensitively)
        $remove_classes = [
            'block-editor-block-list__block',
            'editor-styles-wrapper',
            'wp-block-post-template__inner-block',
        ];

        $blocked = array_flip(array_map('strtolower', $remove_classes));

        $result = preg_replace_callback(
            // 1: whitespace + "class=", 2: quote character, 3: attribute value
            '/(\s+class=)(["\'])(.*?)\2/is',
            function ($match) use ($blocked) {
                $tokens = preg_split('/\s+/', $match[3], -1, PREG_SPLIT_NO_EMPTY);
                $kept = [];

                foreach ($tokens as $token) {
                    if (!isset($blocked[strtolower($token)])) {
                        $kept[] = $token;
                    }
                }

                if (count($kept) === 0) {
                    // Nothing left (or the attribute was empty): drop it entirely.
                    return '';
                }

                if (count($kept) === count($tokens)) {
                    // No admin class present: keep the original text untouched.
                    return $match[0];
                }

                return $match[1] . $match[2] . implode(' ', $kept) . $match[2];
            },
            $html
        );

        // preg_replace_callback() returns null on PCRE failure; keep the input then.
        return null === $result ? $html : $result;
    }

    /**
     * Remove empty HTML tags (div, span, p containing only whitespace)
     *
     * Repeats until a pass removes nothing, so wrappers that become empty
     * after their empty children were removed are removed as well. Every
     * removed element is counted in 'empty_nodes_removed'.
     *
     * @param string $html HTML content
     * @return string Cleaned HTML
     */
    private function remove_empty_tags($html) {
        // \b keeps "<p" from matching longer tag names such as <pre>.
        $patterns = [
            '/<div\b[^>]*>\s*<\/div>/i',
            '/<span\b[^>]*>\s*<\/span>/i',
            '/<p\b[^>]*>\s*<\/p>/i',
        ];

        do {
            $removed_this_pass = 0;

            foreach ($patterns as $pattern) {
                $count = 0;
                $result = preg_replace($pattern, '', $html, -1, $count);

                // Skip the pattern on PCRE failure instead of losing the document.
                if (null !== $result) {
                    $html = $result;
                    $removed_this_pass += $count;
                }
            }

            $this->stats['empty_nodes_removed'] += $removed_this_pass;
            // Each successful removal shortens the string, so this terminates.
        } while ($removed_this_pass > 0);

        return $html;
    }

    /**
     * Get normalization statistics
     *
     * Safe to call before normalize(): all counters are then zero.
     *
     * @return array Statistics of the last run plus 'reduction_percent'
     *               (size reduction in percent, rounded to one decimal)
     */
    public function get_stats() {
        $stats = array_merge($this->default_stats(), $this->stats);
        $reduction = 0;

        // Guard against division by zero for empty input.
        if ($stats['original_size'] > 0) {
            $reduction = round(
                (($stats['original_size'] - $stats['normalized_size']) / $stats['original_size']) * 100,
                1
            );
        }

        return array_merge($stats, [
            'reduction_percent' => $reduction,
        ]);
    }
}
