<?php

namespace Limb_Chatbot\Includes\Services\Knowledge\Generators;

use Limb_Chatbot\Includes\Data_Objects\Dataset;
use Limb_Chatbot\Includes\Data_Objects\Dataset_Entry;
use Limb_Chatbot\Includes\Exceptions\Error_Codes;
use Limb_Chatbot\Includes\Exceptions\Exception;
use Limb_Chatbot\Includes\Factories\Knowledge_Mapper_Factory;
use Limb_Chatbot\Includes\Interfaces\Knowledge_Generator_Interface;
use Limb_Chatbot\Includes\Services\Collection;
use Limb_Chatbot\Includes\Services\Helper;
use Limb_Chatbot\Includes\Services\Knowledge\Builders\Chunk_Entry_Builder;
use Limb_Chatbot\Includes\Services\Knowledge\Chunkers\Heading_Aware_Chunker;
use Limb_Chatbot\Includes\Services\Knowledge\Content_Extractors\Content_Extractor;
use WP_Post;

/**
 * Post Knowledge Generator using chunk-based approach.
 *
 * Generates knowledge entries from WordPress posts in three steps:
 * 1. Content extraction and normalization (delegated to Content_Extractor)
 * 2. Heading-aware chunking (delegated to Heading_Aware_Chunker; documented
 *    targets are 400-600 tokens with 10-15% overlap — the actual limits are
 *    defined by the chunker, not by this class)
 * 3. One dataset entry per chunk (built by Chunk_Entry_Builder)
 *
 * @since 1.0.0
 */
class Post_Knowledge_Generator implements Knowledge_Generator_Interface {

	/**
	 * Content extractor for normalizing post content.
	 *
	 * @var Content_Extractor
	 */
	protected Content_Extractor $content_extractor;

	/**
	 * Heading-aware chunker for segmenting content.
	 *
	 * @var Heading_Aware_Chunker
	 */
	protected Heading_Aware_Chunker $chunker;

	/**
	 * Chunk entry builder for creating dataset entries.
	 *
	 * @var Chunk_Entry_Builder
	 */
	protected Chunk_Entry_Builder $entry_builder;

	/**
	 * Constructor to initialize dependencies.
	 *
	 * @since 1.0.0
	 */
	public function __construct() {
		$this->content_extractor = new Content_Extractor();
		$this->chunker           = new Heading_Aware_Chunker();
		$this->entry_builder     = new Chunk_Entry_Builder();
	}

	/**
	 * Generate knowledge entries from a dataset source.
	 *
	 * Resolves the dataset source to a WP_Post, normalizes its content, stores a
	 * cleaned copy in the `source_content` dataset meta, chunks the normalized
	 * content and turns every chunk into a saved dataset entry.
	 *
	 * @param  Dataset  $dataset  Dataset to generate entries for.
	 *
	 * @return Dataset The same dataset instance, after its entries were generated.
	 * @throws Exception If the source is not a post, the post has no content,
	 *                   chunking yields nothing, or no entry could be built.
	 * @since 1.0.0
	 */
	public function generate( Dataset $dataset ): Dataset {
		// Resolve source object
		$source = Helper::resolve_source_object( $dataset->get_source_type(), $dataset->get_source() );

		if ( ! $source instanceof WP_Post ) {
			throw new Exception( Error_Codes::VALIDATION_INVALID_VALUE,
				__( 'Source must be a WordPress post', 'limb-chatbot' ) );
		}

		// Step 1: Extract and normalize content.
		// Shortcode tags are stripped first so they are removed rather than executed.
		$raw_content = strip_shortcodes( $source->post_content );

		// Normalization itself is delegated to the content extractor; no WordPress
		// content filters are applied in this method.
		$normalized_content = $this->content_extractor->extract( $raw_content );

		if ( empty( $normalized_content ) ) {
			// Reported as a warning and linked to the post so the empty source can be located.
			$mapper    = ( new Knowledge_Mapper_Factory() )->make( $dataset->get_source_type() );
			$exception = new Exception( Error_Codes::EMPTY_VALUE,
				__( 'No content found in ', 'limb-chatbot' ),
				[ 'warning' => true ] );
			$exception->attach_link( get_permalink( $source->ID ), $mapper->get_entry_input( $source ) );
			throw $exception;
		}

		// Keep a cleaned copy of the content in the dataset meta so it can be edited later.
		// Only allow-listed tags (and a few attributes) survive the cleaning.
		$cleaned_content = $this->clean_html_for_source_content( $normalized_content );
		$dataset->update_meta( 'source_content', $cleaned_content );

		// Step 2: Chunk content with heading awareness
		$title  = $dataset->get_name();
		$chunks = $this->chunker->chunk( $normalized_content, $title );

		if ( empty( $chunks ) ) {
			throw new Exception( Error_Codes::EMPTY_VALUE,
				__( 'Failed to chunk content into segments', 'limb-chatbot' ) );
		}

		// Step 3: Process chunks into dataset entries
		$this->process_chunks( $dataset, $chunks, $source, $normalized_content, $title );

		return $dataset;
	}

	/**
	 * Clean HTML content for source_content meta.
	 *
	 * Unwraps div tags, strips every tag that is not allow-listed, and drops all
	 * attributes except `href` on links and `src`/`alt` on images. Afterwards empty
	 * inline/paragraph tags are removed and whitespace is collapsed.
	 *
	 * Any PCRE failure (for example invalid UTF-8 input) leaves the content of that
	 * step untouched instead of replacing it with an empty string.
	 *
	 * @param  string  $html  Raw HTML content.
	 * @return string Clean HTML with only allowed tags and attributes.
	 * @since 1.0.0
	 */
	protected function clean_html_for_source_content( string $html ): string {
		// First, remove div tags but preserve their inner content (nested divs included).
		$html = $this->remove_div_tags_preserve_content( $html );

		// Allowed tags for WordPress WYSIWYG editor content.
		$allowed_tags = [
			'a'      => [ 'href' => [] ],
			'p'      => [],
			'h1'     => [],
			'h2'     => [],
			'h3'     => [],
			'h4'     => [],
			'h5'     => [],
			'h6'     => [],
			'span'   => [],
			'strong' => [],
			'b'      => [],
			'em'     => [],
			'i'      => [],
			'img'    => [ 'src' => [], 'alt' => [] ],
			'table'  => [],
			'thead'  => [],
			'tbody'  => [],
			'tr'     => [],
			'td'     => [],
			'th'     => [],
			'ul'     => [],
			'ol'     => [],
			'li'     => [],
			'br'     => [],
		];

		// wp_kses removes disallowed tags and attributes; the text between removed tags is kept.
		$cleaned = wp_kses( $html, $allowed_tags );

		// Remove empty tags. The tag name must be followed by whitespace or `>`:
		// a bare `[^>]*` would let `<i` match `<img ...>` and `<b` match `<br />`,
		// deleting images/line breaks that sit directly before `</i>` / `</b>`.
		$cleaned = preg_replace( '/<(p|span|strong|b|em|i|a)(?:\s[^>]*)?>\s*<\/\1>/is', '', $cleaned ) ?? $cleaned;

		// Normalize whitespace. The /u pattern returns null on invalid UTF-8, so fall
		// back to a byte-wise pattern (and finally to the input) rather than losing everything.
		$cleaned = preg_replace( '/\s+/u', ' ', $cleaned )
			?? preg_replace( '/\s+/', ' ', $cleaned )
			?? $cleaned;

		// Strip whitespace between adjacent tags, except between two inline tags where
		// the space is the visible word separator ("<strong>a</strong> <em>b</em>").
		$inline_tags = [ 'a', 'span', 'strong', 'b', 'em', 'i', 'img' ];
		$cleaned     = preg_replace_callback(
			'/(<\/?([a-z][a-z0-9]*)[^>]*>)\s+(?=<\/?([a-z][a-z0-9]*))/i',
			static function ( array $matches ) use ( $inline_tags ): string {
				$both_inline = in_array( strtolower( $matches[2] ), $inline_tags, true )
					&& in_array( strtolower( $matches[3] ), $inline_tags, true );

				return $both_inline ? $matches[1] . ' ' : $matches[1];
			},
			$cleaned
		) ?? $cleaned;

		// Restore spacing around block elements for readability
		$cleaned = preg_replace( '/><(h[1-6]|p|ul|ol|li|table|thead|tbody|tr)/i', '> <$1', $cleaned ) ?? $cleaned;
		$cleaned = preg_replace( '/(<\/h[1-6]|<\/p|<\/ul|<\/ol|<\/li|<\/table|<\/thead|<\/tbody|<\/tr)></i', '$1> <', $cleaned ) ?? $cleaned;

		return trim( $cleaned );
	}

	/**
	 * Remove div tags while preserving their inner content.
	 *
	 * Nested divs are unwrapped layer by layer, innermost first. Whatever div tags
	 * remain afterwards (unclosed or otherwise malformed markup) are handled by a
	 * best-effort cleanup that keeps their text.
	 *
	 * If a regular expression fails (PCRE returns null, e.g. when a backtrack or
	 * JIT stack limit is hit on large input), the last good HTML is kept.
	 *
	 * @param  string  $html  HTML content with div tags.
	 * @return string HTML with div tags removed but content preserved.
	 * @since 1.0.0
	 */
	private function remove_div_tags_preserve_content( string $html ): string {
		$max_iterations = 50; // Safety limit on the number of unwrapping passes.
		$iteration      = 0;

		while ( $iteration < $max_iterations && preg_match( '/<div[^>]*>/i', $html ) ) {
			// Match only divs that contain no other opening/closing div tag, i.e. the
			// innermost ones; outer divs become innermost on a later pass.
			$unwrapped = preg_replace_callback(
				'/<div[^>]*>((?:(?!<div[^>]*>|<\/div>).)*?)<\/div>/is',
				function ( array $matches ): string {
					return $this->normalize_unwrapped_div_content( $matches[1] );
				},
				$html
			);

			// Stop on a PCRE error (null) or when a pass changes nothing (only
			// unbalanced div tags are left).
			if ( null === $unwrapped || $unwrapped === $html ) {
				break;
			}

			$html = $unwrapped;
			$iteration++;
		}

		// Best-effort cleanup for leftover div tags: everything from the first remaining
		// opening div to the end of the string is treated as that div's content, with
		// stray closing tags removed.
		$html = preg_replace_callback(
			'/<div[^>]*>(.*)/is',
			function ( array $matches ): string {
				return $this->normalize_unwrapped_div_content( str_ireplace( '</div>', '', $matches[1] ) );
			},
			$html
		) ?? $html;

		// Remove any remaining standalone div opening/closing tags
		$html = preg_replace( '/<div[^>]*>/i', '', $html ) ?? $html;
		$html = str_ireplace( '</div>', '', $html );

		return $html;
	}

	/**
	 * Turn the inner content of a removed div into standalone markup.
	 *
	 * Trims the content, returns an empty string when nothing is left, returns the
	 * content unchanged when it already starts with a block-level tag, and wraps it
	 * in a paragraph otherwise.
	 *
	 * @param  string  $inner_content  Markup that was enclosed by a div.
	 * @return string Markup to put in place of the div.
	 * @since 1.0.0
	 */
	private function normalize_unwrapped_div_content( string $inner_content ): string {
		$inner_content = trim( $inner_content );

		// Strict comparison on purpose: empty() would also discard the text "0".
		if ( '' === $inner_content ) {
			return '';
		}

		// Content that already starts with a block element keeps its own structure.
		if ( preg_match( '/^<(h[1-6]|p|ul|ol|table|thead|tbody|tr|li)/i', $inner_content ) ) {
			return $inner_content;
		}

		// Otherwise wrap in a paragraph so the text keeps a block boundary.
		return '<p>' . $inner_content . '</p>';
	}

	/**
	 * Process chunks into dataset entries.
	 *
	 * Builds one entry per chunk, enriched with the title, source URL, source type
	 * and chunk index, then saves every entry that could be built. Build and save
	 * failures of individual entries are logged and skipped.
	 *
	 * @param  Dataset  $dataset  Dataset to store entries.
	 * @param  array  $chunks  Array of chunk data.
	 * @param  WP_Post  $post  WordPress post object.
	 * @param  string  $normalized_content  Full normalized HTML content. Not used by
	 *                                      this implementation; kept for the signature.
	 * @param  string  $title  Document title.
	 *
	 * @throws Exception If no entry could be built from the chunks.
	 * @since 1.0.0
	 */
	protected function process_chunks(
		Dataset $dataset,
		array $chunks,
		WP_Post $post,
		string $normalized_content,
		string $title
	): void {
		$all_entries = new Collection();
		$source_url  = get_permalink( $post->ID );
		$source_type = $dataset->get_source_type();

		// The index counts chunks, not built entries: it advances even when a build fails.
		$chunk_index = 0;
		foreach ( $chunks as $chunk ) {
			try {
				// Enrich chunk with source metadata for better context
				$chunk['post_title']  = $title;
				$chunk['source_url']  = $source_url;
				$chunk['source_type'] = $source_type;
				$chunk['chunk_index'] = $chunk_index++;

				$entry = $this->entry_builder->build( $chunk );
				$entry->set_dataset_id( $dataset->get_id() );
				$all_entries->push_item( $entry );
			} catch ( \Exception $e ) {
				// Log individual entry build errors but don't fail the whole process
				Helper::log( [
					'error'      => 'Failed to build chunk entry',
					'dataset_id' => $dataset->get_id(),
					'chunk_data' => $chunk,
					'exception'  => $e->getMessage()
				], __METHOD__ );
			}
		}

		if ( $all_entries->is_empty() ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'No valid entries were generated from given source', 'limb-chatbot' )
			);
		}

		// Save all entries to the dataset
		$all_entries->each( function ( Dataset_Entry $entry ) use ( $dataset ) {
			try {
				$entry->save();
			} catch ( \Exception $e ) {
				// Log individual entry save errors but don't fail the whole process
				Helper::log( [
					'error'      => 'Failed to save dataset entry',
					'dataset_id' => $dataset->get_id(),
					'entry_data' => $entry->get_entry(),
					'exception'  => $e->getMessage()
				], __METHOD__ );
			}
		} );
	}
}