<?php

namespace Limb_Chatbot\Includes\Services\Knowledge\Content_Extractors;

use DOMDocument;
use DOMXPath;
use Limb_Chatbot\Includes\Exceptions\Error_Codes;
use Limb_Chatbot\Includes\Exceptions\Exception;

/**
 * Enhanced URL Content Extractor with main article detection.
 *
 * Fetches a page over HTTP, strips boilerplate elements (scripts, navigation,
 * sidebars, ads, ...), scores the remaining containers with a heuristic similar
 * to Mozilla Readability and hands the best-scoring container to
 * Content_Extractor::normalize().
 */
class URL_Content_Extractor {

	/**
	 * Minimum trimmed text length (in characters) a node needs to receive a score.
	 */
	private const MIN_TEXT_LENGTH = 150;

	/**
	 * Score below which the targeted candidates are considered weak, so every
	 * div/section in the document is scored as well.
	 */
	private const MIN_CONFIDENT_SCORE = 100;

	/**
	 * Score added for every "good" pattern found in a node's class or id.
	 */
	private const GOOD_PATTERN_BONUS = 75;

	/**
	 * Score subtracted for every "bad" pattern found in a node's class or id.
	 */
	private const BAD_PATTERN_PENALTY = 150;

	/**
	 * Matches advertising markers in a lower-cased class/id value.
	 *
	 * "ad" / "ads" only count as a whole word (delimited by anything that is not
	 * a letter or digit on the left and not a letter on the right), so values
	 * such as "header", "heading", "loaded" or "padding" do NOT match.
	 */
	private const AD_MARKER_PATTERN = '/(?<![a-z0-9])(?:ads?(?![a-z])|advert|adsense|adsbygoogle|adunit|adslot|adbox)/';

	/**
	 * Normalizer applied to the extracted main-content HTML.
	 */
	private Content_Extractor $content_extractor;

	/**
	 * Creates the extractor together with its content normalizer.
	 */
	public function __construct() {
		$this->content_extractor = new Content_Extractor();
	}

	/**
	 * Fetch a URL and extract its title and main content.
	 *
	 * @param string $url Absolute URL of the page to fetch.
	 *
	 * @return array Array with the keys 'title' (document <title>, or the URL host
	 *               when the page has no title) and 'content' (the value returned by
	 *               Content_Extractor::normalize() for the main-content HTML).
	 *
	 * @throws Exception When the URL is invalid, the request fails, the response
	 *                   status is not 200, or no content can be extracted.
	 */
	public function extract( string $url ): array {
		if ( ! filter_var( $url, FILTER_VALIDATE_URL ) ) {
			throw new Exception(
				Error_Codes::VALIDATION_INVALID_VALUE,
				__( 'Invalid URL format.', 'limb-chatbot' )
			);
		}

		// NOTE(review): $url is caller-supplied. If it can originate from untrusted
		// input, consider wp_safe_remote_get() to block requests to internal hosts.
		$plugin_version = Limb_Chatbot()->get_version();
		$response       = wp_remote_get( $url, [
			'timeout'     => 30,
			'redirection' => 5,
			'user-agent'  => "Mozilla/5.0 (compatible; LimbChatbot/{$plugin_version}; +https://wordpress.org/plugins/limb-chatbot/)",
			'sslverify'   => true,
			'headers'     => [
				'Accept' => 'text/html',
			],
		] );

		if ( is_wp_error( $response ) ) {
			throw new Exception(
				Error_Codes::TECHNICAL_ERROR,
				sprintf( __( 'Failed to fetch URL: %s', 'limb-chatbot' ), $response->get_error_message() )
			);
		}

		$response_code = wp_remote_retrieve_response_code( $response );
		if ( $response_code !== 200 ) {
			throw new Exception(
				Error_Codes::TECHNICAL_ERROR,
				sprintf( __( 'URL returned HTTP status %d', 'limb-chatbot' ), $response_code )
			);
		}

		$body = wp_remote_retrieve_body( $response );
		if ( ! is_string( $body ) || '' === $body ) {
			throw new Exception( Error_Codes::EMPTY_VALUE, __( 'No content found in URL response', 'limb-chatbot' ) );
		}

		// Collect libxml parse errors internally instead of emitting warnings, and
		// restore the previous setting afterwards so other code is not affected.
		$previous_libxml_state = libxml_use_internal_errors( true );
		$dom                   = new DOMDocument();
		$dom->encoding         = 'UTF-8';
		// The XML declaration prefix makes libxml treat the markup as UTF-8.
		$dom->loadHTML( '<?xml encoding="UTF-8">' . $body );
		libxml_clear_errors();
		libxml_use_internal_errors( $previous_libxml_state );

		$xpath = new DOMXPath( $dom );

		// Extract title
		$title       = '';
		$title_nodes = $xpath->query( '//title' );
		if ( $title_nodes && $title_nodes->length > 0 ) {
			$title = trim( $title_nodes->item( 0 )->textContent );
			$title = html_entity_decode( $title, ENT_QUOTES | ENT_HTML5, 'UTF-8' );
		}

		// === MAIN CONTENT EXTRACTION (Readability-style) ===
		$main_content_html = $this->extract_main_content( $dom, $xpath );

		// Explicit comparison: empty() would also reject a page whose text is "0".
		if ( '' === trim( strip_tags( $main_content_html ) ) ) {
			throw new Exception( Error_Codes::EMPTY_VALUE, __( 'No extractable main content found', 'limb-chatbot' ) );
		}

		// Normalize the selected HTML through the shared content extractor.
		$normalized_content = $this->content_extractor->normalize( $main_content_html );

		return [
			'title'   => '' !== $title ? $title : parse_url( $url, PHP_URL_HOST ),
			'content' => $normalized_content,
		];
	}

	/**
	 * Extract the main article content using heuristic scoring.
	 *
	 * Noise elements are removed from the whole document first. Targeted candidate
	 * containers are scored next; when none of them reaches MIN_CONFIDENT_SCORE,
	 * every div/section is scored too. If nothing qualifies, <body> is used.
	 *
	 * @param DOMDocument $dom   Parsed document (modified in place).
	 * @param DOMXPath    $xpath XPath instance bound to $dom.
	 *
	 * @return string HTML of the selected node, or '' when the document has no body.
	 */
	private function extract_main_content( DOMDocument $dom, DOMXPath $xpath ): string {
		// Remove obvious noise first (global)
		$this->remove_noise_elements( $xpath );

		// Expanded candidate nodes (covers WordPress, modern sites, docs, etc.)
		$candidates = $xpath->query(
			'//article | ' .
			'//main | ' .
			'//div[@role="main"] | ' .
			'//section[contains(@class, "content") or contains(@class, "main")] | ' .
			'//div[contains(@class, "content")] | ' .
			'//div[contains(@id, "content")] | ' .
			'//div[contains(@class, "post")] | ' .
			'//div[contains(@class, "article")] | ' .
			'//div[contains(@class, "entry")] | ' .
			'//div[contains(@class, "entry-content")] | ' .   // Very common in WP
			'//div[contains(@class, "post-content")] | ' .
			'//div[contains(@class, "blog-post")] | ' .
			'//div[contains(@class, "story")] | ' .
			'//div[contains(@class, "text")] | ' .
			'//div[contains(@class, "body")] | ' .
			'//div[contains(@class, "markdown")] | ' .
			'//section[contains(@class, "markdown")]'
		);

		$best_node  = null;
		$best_score = 0;

		if ( $candidates instanceof \DOMNodeList ) {
			foreach ( $candidates as $node ) {
				$score = $this->score_node( $node, $xpath );
				if ( $score > $best_score ) {
					$best_score = $score;
					$best_node  = $node;
				}
			}
		}

		// Fallback: score all divs and sections with significant text
		if ( ! $best_node || $best_score < self::MIN_CONFIDENT_SCORE ) {
			$all_containers = $xpath->query( '//div | //section' );
			if ( $all_containers instanceof \DOMNodeList ) {
				foreach ( $all_containers as $container ) {
					$score = $this->score_node( $container, $xpath );
					if ( $score > $best_score && $score >= self::MIN_CONFIDENT_SCORE ) {
						$best_score = $score;
						$best_node  = $container;
					}
				}
			}
		}

		if ( ! $best_node ) {
			// Last resort: use body
			$body_nodes = $xpath->query( '//body' );
			if ( ! $body_nodes instanceof \DOMNodeList || 0 === $body_nodes->length ) {
				return '';
			}
			$best_node = $body_nodes->item( 0 );
		}

		// Final clean inside the selected node (also applied to the body fallback)
		$this->clean_node( $best_node, $xpath );

		return (string) $dom->saveHTML( $best_node );
	}

	/**
	 * Remove boilerplate elements from the whole document.
	 *
	 * @param DOMXPath $xpath XPath instance bound to the document to clean.
	 */
	private function remove_noise_elements( DOMXPath $xpath ): void {
		$noise_selectors = [
			'//script',
			'//style',
			'//noscript',
			'//header',
			'//footer',
			'//nav',
			'//aside',
			'//div[contains(@class, "sidebar")]',
			'//div[contains(@id, "sidebar")]',
			'//div[contains(@class, "cookie")]',
			'//div[contains(@class, "banner")]',
			'//div[contains(@class, "popup")]',
			'//div[contains(@class, "modal")]',
			'//div[contains(@class, "newsletter")]',
			'//div[contains(@class, "subscribe")]',
			'//div[contains(@class, "related")]',
			'//div[contains(@class, "recommended")]',
			'//div[contains(@class, "suggested")]',
			'//div[contains(@class, "author-box")]',
			'//div[contains(@class, "bio")]',
			'//div[contains(@id, "comments")]',
			'//section[contains(@class, "comments")]',
			'//ol[contains(@class, "breadcrumb")]',
			'//div[contains(@class, "tags")]',
			'//div[contains(@class, "categories")]',
			'//form',
			'//iframe',
		];

		foreach ( $noise_selectors as $selector ) {
			$this->remove_nodes( $xpath->query( $selector ) );
		}

		// Ad containers are matched in PHP: a plain contains(@class, "ad") would
		// also delete divs whose class merely contains "ad" ("header", "heading", ...).
		$this->remove_ad_containers( $xpath->query( '//div[@class]' ) );
	}

	/**
	 * Score a node as a main-content candidate.
	 *
	 * The base score is the trimmed text length. Every good/bad pattern found in
	 * the class or id adjusts it, and a high share of link text reduces it.
	 *
	 * @param \DOMNode $node  Node to score.
	 * @param DOMXPath $xpath XPath instance bound to the node's document.
	 *
	 * @return int Score, never negative; 0 when the node has too little text.
	 */
	private function score_node( \DOMNode $node, DOMXPath $xpath ): int {
		$text_len = mb_strlen( trim( $node->textContent ) );

		if ( $text_len < self::MIN_TEXT_LENGTH ) {
			return 0;
		}

		$score = $text_len;

		// Only elements carry attributes; other node types are scored on text alone.
		$raw_class = '';
		$raw_id    = '';
		if ( $node instanceof \DOMElement ) {
			$raw_class = $node->getAttribute( 'class' );
			$raw_id    = $node->getAttribute( 'id' );
		}
		$class = strtolower( $raw_class );
		$id    = strtolower( $raw_id );

		$good_patterns = [
			'content', 'article', 'post', 'entry', 'main', 'body',
			'story', 'text', 'markdown', 'blog', 'page'
		];
		// Advertising markers are handled separately below with word boundaries.
		$bad_patterns = [
			'nav', 'header', 'footer', 'sidebar', 'menu',
			'cookie', 'banner', 'popup', 'modal', 'share',
			'social', 'comment', 'related', 'recommended', 'subscribe',
			'newsletter', 'author', 'bio', 'tag', 'category', 'breadcrumb'
		];

		foreach ( $good_patterns as $pat ) {
			if ( strpos( $class, $pat ) !== false || strpos( $id, $pat ) !== false ) {
				$score += self::GOOD_PATTERN_BONUS;
			}
		}

		foreach ( $bad_patterns as $pat ) {
			if ( strpos( $class, $pat ) !== false || strpos( $id, $pat ) !== false ) {
				$score -= self::BAD_PATTERN_PENALTY;
			}
		}

		if ( $this->has_ad_marker( $raw_class ) || $this->has_ad_marker( $raw_id ) ) {
			$score -= self::BAD_PATTERN_PENALTY;
		}

		// Penalize high link density (menus, sidebars)
		$links         = $xpath->query( './/a', $node );
		$link_text_len = 0;
		if ( $links instanceof \DOMNodeList ) {
			foreach ( $links as $link ) {
				$link_text_len += mb_strlen( trim( $link->textContent ) );
			}
		}

		if ( $link_text_len > 0 ) {
			// $text_len is at least MIN_TEXT_LENGTH here, so the division is safe.
			$link_density = $link_text_len / $text_len;
			if ( $link_density > 0.4 ) {
				$score -= (int) ( $text_len * 0.7 );
			} elseif ( $link_density > 0.3 ) {
				$score -= (int) ( $text_len * 0.4 );
			}
		}

		return max( 0, $score );
	}

	/**
	 * Remove leftover non-content elements from inside the selected node.
	 *
	 * @param \DOMNode $node  Node whose descendants are cleaned (modified in place).
	 * @param DOMXPath $xpath XPath instance bound to the node's document.
	 */
	private function clean_node( \DOMNode $node, DOMXPath $xpath ): void {
		$remove_selectors = [
			'.//script',
			'.//style',
			'.//button',
			'.//input',
			'.//form',
			'.//nav',
			'.//aside',
			'.//footer',
			'.//div[contains(@class, "share") or contains(@class, "social")]',
			'.//div[contains(@class, "author") or contains(@class, "bio")]',
			'.//div[contains(@class, "related") or contains(@class, "recommended")]',
		];

		foreach ( $remove_selectors as $sel ) {
			$this->remove_nodes( $xpath->query( $sel, $node ) );
		}

		// Word-boundary ad matching, see has_ad_marker().
		$this->remove_ad_containers( $xpath->query( './/div[@class]', $node ) );
	}

	/**
	 * Detach every node of an XPath result from its parent.
	 *
	 * @param \DOMNodeList|false $nodes Result of DOMXPath::query(); false is ignored.
	 */
	private function remove_nodes( $nodes ): void {
		if ( ! $nodes instanceof \DOMNodeList ) {
			return;
		}

		foreach ( $nodes as $node ) {
			if ( $node->parentNode ) {
				$node->parentNode->removeChild( $node );
			}
		}
	}

	/**
	 * Detach the elements of an XPath result whose class marks them as advertising.
	 *
	 * @param \DOMNodeList|false $elements Result of DOMXPath::query(); false is ignored.
	 */
	private function remove_ad_containers( $elements ): void {
		if ( ! $elements instanceof \DOMNodeList ) {
			return;
		}

		foreach ( $elements as $element ) {
			if (
				$element instanceof \DOMElement
				&& $element->parentNode
				&& $this->has_ad_marker( $element->getAttribute( 'class' ) )
			) {
				$element->parentNode->removeChild( $element );
			}
		}
	}

	/**
	 * Tell whether a class/id attribute value contains an advertising marker.
	 *
	 * @param string $value Raw attribute value (original letter case).
	 *
	 * @return bool True for values such as "ad", "ads", "ad-slot", "sidebarAd" or
	 *              "advertisement"; false for "header", "heading", "loaded", ...
	 */
	private function has_ad_marker( string $value ): bool {
		if ( '' === $value ) {
			return false;
		}

		// Split camelCase ("sidebarAd" -> "sidebar Ad") so the word boundary in
		// AD_MARKER_PATTERN also applies to camel-cased names.
		$split      = preg_replace( '/(?<=[a-z0-9])(?=[A-Z])/', ' ', $value );
		$normalized = strtolower( null === $split ? $value : $split );

		return 1 === preg_match( self::AD_MARKER_PATTERN, $normalized );
	}
}