<?php
// Stop immediately when this file is requested directly rather than loaded by WordPress.
defined('ABSPATH') || exit;

/**
 * Turns a published post into plain, lightly structured text and checks
 * whether that text is long enough to be summarized.
 *
 * All methods are static; the class holds no per-request state.
 */
class Voxfor_Content_Extractor {

    /**
     * Tags from the wp_kses() allow-list in clean_content() that flow inline
     * with the surrounding text instead of starting a new paragraph.
     *
     * @var string[]
     */
    private static $inline_tags = ['strong', 'b', 'em', 'i', 'br'];

    /**
     * Build the text representation of a published post.
     *
     * @param int $post_id Post ID, passed straight to get_post().
     * @return string|null "Title: ..." followed by the cleaned body, separated by a
     *                     blank line; null when the post does not exist, is not
     *                     published, or yields no text at all.
     */
    public static function extract_post_content($post_id) {
        $post = get_post($post_id);

        if (!$post || $post->post_status !== 'publish') {
            return null;
        }

        $content_parts = [];

        $title = trim(wp_strip_all_tags($post->post_title));
        // Compare with '' rather than empty(): empty('0') is true and would drop a title of "0".
        if ($title !== '') {
            $content_parts[] = "Title: " . $title;
        }

        $clean_content = self::clean_content($post->post_content);
        if ($clean_content !== '') {
            $content_parts[] = $clean_content;
        }

        if (empty($content_parts)) {
            return null;
        }

        return implode("\n\n", $content_parts);
    }

    /**
     * Render blocks, strip markup and return the post body as plain text.
     *
     * Paragraph-level pieces are separated by one blank line; headings, quotes and
     * list items are prefixed with "Heading: ", "Quote: " and "- " respectively.
     *
     * @param string $content Raw post_content.
     * @return string Cleaned text, '' when nothing is left.
     */
    private static function clean_content($content) {
        $content = do_blocks($content);

        // wp_kses() removes disallowed tags but keeps the text between them, so
        // script/style bodies have to be removed together with their tags first.
        $content = self::replace_or_keep('@<(script|style)[^>]*?>.*?</\\1>@si', '', $content);

        // Drops every single-line "[...]" run (shortcode tags, registered or not).
        $content = self::replace_or_keep('/\[.*?\]/', '', $content);

        $content = wp_kses($content, [
            'p' => [],
            'br' => [],
            'h1' => [],
            'h2' => [],
            'h3' => [],
            'h4' => [],
            'h5' => [],
            'h6' => [],
            'ul' => [],
            'ol' => [],
            'li' => [],
            'blockquote' => [],
            'strong' => [],
            'b' => [],
            'em' => [],
            'i' => [],
        ]);

        // Remember the caller's libxml error mode and restore it afterwards so this
        // method does not change global libxml behaviour for the rest of the request.
        $previous_error_mode = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        // The leading XML declaration makes libxml read the fragment as UTF-8.
        $dom->loadHTML('<?xml encoding="UTF-8"><html><body>' . $content . '</body></html>', LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
        libxml_clear_errors();
        libxml_use_internal_errors($previous_error_mode);

        $body = $dom->getElementsByTagName('body')->item(0);

        $cleaned_parts = [];
        if ($body) {
            self::extract_text_recursively($body, $cleaned_parts);
        }

        $text = implode("\n\n", $cleaned_parts);
        $text = self::replace_or_keep('/\n{3,}/', "\n\n", $text);

        return trim($text);
    }

    /**
     * Walk a DOM subtree and append one entry to $parts per paragraph-level piece.
     *
     * Headings, paragraphs, blockquotes and lists are emitted as single entries.
     * Any other element is treated as a container: runs of loose text and inline
     * tags among its children are grouped into paragraphs (split on blank lines),
     * and every other child element is processed recursively.
     *
     * @param DOMNode|null $node  Node to process; null is ignored.
     * @param string[]     $parts Collected text pieces, appended to in place.
     * @return void
     */
    private static function extract_text_recursively($node, &$parts) {
        if ($node === null) {
            return;
        }

        if ($node->nodeType === XML_TEXT_NODE || $node->nodeType === XML_CDATA_SECTION_NODE) {
            self::flush_inline_text($node->nodeValue, $parts);
            return;
        }

        if ($node->nodeType !== XML_ELEMENT_NODE) {
            return;
        }

        $tag_name = strtolower($node->nodeName);

        switch ($tag_name) {
            case 'h1':
            case 'h2':
            case 'h3':
            case 'h4':
            case 'h5':
            case 'h6':
                $text = self::normalize_text(self::collect_text($node));
                if ($text !== '') {
                    $parts[] = "Heading: " . $text;
                }
                break;

            case 'p':
                $text = self::normalize_text(self::collect_text($node));
                if ($text !== '') {
                    $parts[] = $text;
                }
                break;

            case 'blockquote':
                $text = self::normalize_text(self::collect_text($node));
                if ($text !== '') {
                    $parts[] = "Quote: " . $text;
                }
                break;

            case 'ul':
            case 'ol':
                $list_items = [];
                foreach ($node->childNodes as $child) {
                    if ($child->nodeName === 'li') {
                        $item_text = self::normalize_text(self::collect_text($child));
                        if ($item_text !== '') {
                            $list_items[] = "- " . $item_text;
                        }
                    }
                }
                if (!empty($list_items)) {
                    $parts[] = implode("\n", $list_items);
                }
                break;

            default:
                // Container (body, a stray <li>, a top-level inline tag, ...).
                // Text that sits directly in a container must not be lost: classic
                // editor content is stored without <p> tags, and wp_kses() leaves
                // bare text behind wherever it removed a wrapper such as <div>.
                $inline_buffer = '';
                foreach ($node->childNodes as $child) {
                    if (self::is_inline_node($child)) {
                        $inline_buffer .= self::collect_text($child);
                        continue;
                    }
                    if ($child->nodeType !== XML_ELEMENT_NODE) {
                        // Comments and the like neither carry text nor end a paragraph.
                        continue;
                    }
                    self::flush_inline_text($inline_buffer, $parts);
                    $inline_buffer = '';
                    self::extract_text_recursively($child, $parts);
                }
                self::flush_inline_text($inline_buffer, $parts);
                break;
        }
    }

    /**
     * Tell whether a node continues the current line of text (text node or inline tag).
     *
     * @param DOMNode $node
     * @return bool
     */
    private static function is_inline_node($node) {
        if ($node->nodeType === XML_TEXT_NODE || $node->nodeType === XML_CDATA_SECTION_NODE) {
            return true;
        }

        return $node->nodeType === XML_ELEMENT_NODE
            && in_array(strtolower($node->nodeName), self::$inline_tags, true);
    }

    /**
     * Concatenate the text below a node, keeping word boundaries intact.
     *
     * Unlike DOMNode::textContent, a <br> becomes a newline and the text of every
     * non-inline element is wrapped in newlines, so "Hello<br>world" or two
     * adjacent paragraphs inside a quote do not run into each other.
     *
     * @param DOMNode $node
     * @return string Raw text; pass it through normalize_text() before use.
     */
    private static function collect_text($node) {
        if ($node->nodeType === XML_TEXT_NODE || $node->nodeType === XML_CDATA_SECTION_NODE) {
            return $node->nodeValue;
        }

        if ($node->nodeType !== XML_ELEMENT_NODE) {
            return '';
        }

        $tag_name = strtolower($node->nodeName);
        if ($tag_name === 'br') {
            return "\n";
        }

        $text = '';
        foreach ($node->childNodes as $child) {
            $text .= self::collect_text($child);
        }

        if (in_array($tag_name, self::$inline_tags, true)) {
            return $text;
        }

        return "\n" . $text . "\n";
    }

    /**
     * Split loose text on blank lines and append each non-empty paragraph to $parts.
     *
     * @param string   $buffer Raw text gathered from consecutive inline nodes.
     * @param string[] $parts  Collected text pieces, appended to in place.
     * @return void
     */
    private static function flush_inline_text($buffer, &$parts) {
        $paragraphs = preg_split('/\n\s*\n/', $buffer);
        if ($paragraphs === false) {
            $paragraphs = [$buffer];
        }

        foreach ($paragraphs as $paragraph) {
            $paragraph = self::normalize_text($paragraph);
            if ($paragraph !== '') {
                $parts[] = $paragraph;
            }
        }
    }

    /**
     * Collapse whitespace inside one text piece.
     *
     * Runs of spaces/tabs become a single space, and each run of line breaks
     * (with surrounding spaces) becomes a single "\n"; the result is trimmed.
     *
     * @param string $text
     * @return string
     */
    private static function normalize_text($text) {
        $text = self::replace_or_keep('/[ \t\r\f\x0B]+/', ' ', $text);
        $text = self::replace_or_keep('/ ?\n[ \n]*/', "\n", $text);

        return trim($text);
    }

    /**
     * preg_replace() that returns the unchanged subject when PCRE reports an error,
     * instead of null (which would silently discard the whole text).
     *
     * @param string $pattern
     * @param string $replacement
     * @param string $subject
     * @return string
     */
    private static function replace_or_keep($pattern, $replacement, $subject) {
        $result = preg_replace($pattern, $replacement, $subject);

        return $result === null ? $subject : $result;
    }

    /**
     * Check that extracted content is long enough to summarize.
     *
     * @param string|null $content Text as returned by extract_post_content().
     * @return array ['valid' => true, 'word_count' => int] when the content has at
     *               least 50 words, otherwise ['valid' => false, 'message' => string].
     */
    public static function validate_content($content) {
        if (empty($content)) {
            return [
                'valid' => false,
                'message' => __('No content available to summarize.', 'voxfor-ai-content-summary')
            ];
        }

        $word_count = self::count_words_unicode($content);

        if ($word_count < 50) {
            return [
                'valid' => false,
                'message' => __('Content is too short for a quality summary. Minimum 50 words required.', 'voxfor-ai-content-summary')
            ];
        }

        return [
            'valid' => true,
            'word_count' => $word_count
        ];
    }

    /**
     * Count words as runs of Unicode letters and digits.
     *
     * @param string $text
     * @return int Number of words; 0 for empty or whitespace-only input.
     */
    private static function count_words_unicode($text) {
        $text = trim((string) $text);

        if ($text === '') {
            return 0;
        }

        $count = preg_match_all('/[\p{L}\p{N}]+/u', $text, $matches);

        if ($count === false) {
            // The /u modifier rejects malformed UTF-8. Fall back to counting
            // whitespace-separated tokens instead of reporting zero words.
            $tokens = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
            return $tokens === false ? 0 : count($tokens);
        }

        return $count;
    }
}
