<?php

namespace Limb_Chatbot\Includes\Services\Knowledge\Content_Extractors;

use WP_Post;

/**
 * Content Extractor for WordPress posts.
 *
 * Extracts and normalizes post content according to strict rules:
 * - Keeps headings (h1-h6)
 * - Keeps lists
 * - Removes scripts, styles, shortcodes, comments
 * - Preserves reading order
 *
 * @since 1.0.0
 */
class Content_Extractor {

	/**
	 * Extract and normalize content from raw post content.
	 *
	 * The raw content is passed through the 'the_content' filter and the
	 * filtered markup is then handed to normalize().
	 *
	 * @param string $raw_content Raw post content (not a post object).
	 * @return string Normalized HTML content, or an empty string when the
	 *                filtered content is blank or not a scalar value.
	 * @since 1.0.0
	 */
	public function extract( $raw_content ): string {
		$content = apply_filters( 'the_content', $raw_content );

		// A filter callback may return anything; only scalar values can be treated as markup.
		if ( ! is_scalar( $content ) ) {
			return '';
		}

		$content = (string) $content;

		// Explicit blank check: empty() would also discard the legitimate content "0".
		if ( '' === trim( $content ) ) {
			return '';
		}

		return $this->normalize( $content );
	}

	/**
	 * Normalize HTML content by removing unwanted elements and preserving structure.
	 *
	 * Steps, in order: drop script/style/noscript/comment markup, convert
	 * figures and images to markdown image syntax, drop SVG, navigation and
	 * UI markup, strip shortcodes, drop empty div/span/p elements and
	 * collapse whitespace. A step whose regular expression fails is skipped
	 * so that the content collected so far is never lost.
	 *
	 * @param string $html Raw HTML content.
	 * @return string Normalized HTML content.
	 * @since 1.0.0
	 */
	public function normalize( string $html ): string {
		// Scripts, styles, noscript blocks and HTML comments never carry readable content.
		$non_content_patterns = array(
			'/<script[^>]*>.*?<\/script>/is',
			'/<style[^>]*>.*?<\/style>/is',
			'/<noscript[^>]*>.*?<\/noscript>/is',
			'/<!--.*?-->/s',
		);
		foreach ( $non_content_patterns as $pattern ) {
			$html = $this->replace( $pattern, '', $html );
		}

		// Convert figures (image plus optional caption) to a markdown image
		// wrapped in a paragraph so the chunker still sees a block element.
		$html = $this->replace(
			'/<figure[^>]*>.*?<\/figure>/is',
			function ( array $matches ): string {
				$figure = $matches[0];
				$alt    = $this->extract_attribute( $figure, 'alt' );

				// Fall back to the caption text when the image has no usable alt text.
				if ( '' === $alt && preg_match( '/<figcaption[^>]*>(.*?)<\/figcaption>/is', $figure, $caption ) ) {
					$alt = trim( wp_strip_all_tags( $caption[1] ) );
				}

				$markdown = $this->build_markdown_image( $this->extract_image_source( $figure ), $alt );

				return '' === $markdown ? '' : '<p>' . $markdown . '</p>';
			},
			$html
		);

		// Convert any remaining standalone img tags to markdown.
		$html = $this->replace(
			'/<img([^>]*)>/i',
			function ( array $matches ): string {
				$attributes = $matches[1];

				return $this->build_markdown_image(
					$this->extract_image_source( $attributes ),
					$this->extract_attribute( $attributes, 'alt' )
				);
			},
			$html
		);

		// Remove SVG, navigation, UI and page-header markup. Order is kept
		// from the original implementation because later patterns operate on
		// what earlier ones left behind.
		// NOTE: the div patterns match their keywords as substrings of the
		// class/id value and stop at the first closing </div>, so they are
		// heuristics rather than exact structural matches.
		$chrome_patterns = array(
			'/<svg[^>]*>.*?<\/svg>/is',
			'/<nav[^>]*>.*?<\/nav>/is',
			'/<div[^>]*(?:class|id)=["\'][^"\']*(?:toc|nav|prev|next|docs-prev-next|docs-toc|docs-search|mobile-menu|menu-toggle)[^"\']*["\'][^>]*>.*?<\/div>/is',
			'/<button[^>]*>.*?<\/button>/is',
			'/<input[^>]*>/is',
			'/<div[^>]*(?:class|id)=["\'][^"\']*(?:md:hidden|hidden|search|menu|toggle|mobile)[^"\']*["\'][^>]*>.*?<\/div>/is',
			'/<header[^>]*class=["\'][^"\']*docs-content-header[^"\']*["\'][^>]*>.*?<\/header>/is',
		);
		foreach ( $chrome_patterns as $pattern ) {
			$html = $this->replace( $pattern, '', $html );
		}

		// Strip any shortcode tags that are still present in the markup.
		$html = strip_shortcodes( $html );

		// Headings, lists, paragraphs, divs and spans are intentionally kept;
		// only elements with no content at all are dropped.
		$html = $this->replace( '/<(div|span|p)[^>]*>\s*<\/\1>/is', '', $html );

		// Collapse whitespace runs to a single space.
		$collapsed = preg_replace( '/\s+/u', ' ', $html );
		if ( null === $collapsed ) {
			// The "u" modifier fails on invalid UTF-8; retry byte-wise instead of losing the content.
			$collapsed = preg_replace( '/\s+/', ' ', $html );
		}
		if ( is_string( $collapsed ) ) {
			$html = $collapsed;
		}
		$html = $this->replace( '/>\s+</', '><', $html );

		// Restore spacing around block elements for readability.
		$html = $this->replace( '/><(h[1-6]|p|div|ul|ol|li|br)/i', '> <$1', $html );
		$html = $this->replace( '/(<\/h[1-6]|<\/p|<\/div|<\/ul|<\/ol|<\/li|<\/br)></i', '$1> <', $html );

		return trim( $html );
	}

	/**
	 * Run a regex replacement and keep the subject untouched when PCRE fails.
	 *
	 * preg_replace() and preg_replace_callback() return null on error; using
	 * that result directly would wipe the whole document.
	 *
	 * @param string          $pattern     Regular expression.
	 * @param string|\Closure $replacement Replacement string, or a closure receiving the match array.
	 * @param string          $subject     Text to operate on.
	 * @return string The replaced text, or $subject unchanged on regex failure.
	 */
	private function replace( string $pattern, $replacement, string $subject ): string {
		$result = $replacement instanceof \Closure
			? preg_replace_callback( $pattern, $replacement, $subject )
			: preg_replace( $pattern, $replacement, $subject );

		return is_string( $result ) ? $result : $subject;
	}

	/**
	 * Read the value of an HTML attribute from a piece of markup.
	 *
	 * The attribute name is matched as a whole name (so "src" does not match
	 * inside "data-src") and the value runs up to the matching closing quote
	 * (so an apostrophe inside a double-quoted value is preserved).
	 *
	 * @param string $markup Tag attributes or a larger HTML fragment.
	 * @param string $name   Attribute name, e.g. "src" or "alt".
	 * @return string Trimmed value of the first match, or '' when absent or blank.
	 */
	private function extract_attribute( string $markup, string $name ): string {
		$pattern = '/(?<![\w-])' . preg_quote( $name, '/' ) . '\s*=\s*(["\'])(.*?)\1/is';

		if ( 1 !== preg_match( $pattern, $markup, $match ) ) {
			return '';
		}

		return trim( $match[2] );
	}

	/**
	 * Determine the image URL, taking lazy-loading attributes into account.
	 *
	 * Priority is src, then data-src, then data-lazy-src. When src holds an
	 * inline "data:" URI and a lazy-loading attribute is also present, the
	 * lazy-loading value wins because the inline URI is then only a placeholder.
	 *
	 * @param string $markup Tag attributes or a larger HTML fragment.
	 * @return string Image URL, or '' when none is found.
	 */
	private function extract_image_source( string $markup ): string {
		$src  = $this->extract_attribute( $markup, 'src' );
		$lazy = $this->extract_attribute( $markup, 'data-src' );

		if ( '' === $lazy ) {
			$lazy = $this->extract_attribute( $markup, 'data-lazy-src' );
		}

		if ( '' === $src ) {
			return $lazy;
		}

		if ( '' !== $lazy && 0 === stripos( $src, 'data:' ) ) {
			return $lazy;
		}

		return $src;
	}

	/**
	 * Build the markdown representation of an image.
	 *
	 * @param string $src Image URL, may be empty.
	 * @param string $alt Alternative text, may be empty.
	 * @return string "![alt](src)", "![](src)", "![alt]" or '' when both are empty.
	 */
	private function build_markdown_image( string $src, string $alt ): string {
		if ( '' !== $src ) {
			return '![' . $alt . '](' . $src . ')';
		}

		if ( '' !== $alt ) {
			return '![' . $alt . ']';
		}

		return '';
	}
}
