<?php

namespace Limb_Chatbot\Includes\Services\Knowledge\Generators;

use Limb_Chatbot\Includes\Data_Objects\Dataset;
use Limb_Chatbot\Includes\Data_Objects\Dataset_Entry;
use Limb_Chatbot\Includes\Exceptions\Error_Codes;
use Limb_Chatbot\Includes\Exceptions\Exception;
use Limb_Chatbot\Includes\Interfaces\Knowledge_Generator_Interface;
use Limb_Chatbot\Includes\Services\Collection;
use Limb_Chatbot\Includes\Services\Helper;
use Limb_Chatbot\Includes\Services\Knowledge\Knowledge_Mappers\Text_Knowledge_Mapper;

/**
 * Text Knowledge Generator using chunk-based approach.
 *
 * Generates knowledge entries from text content using:
 * 1. Content extraction and normalization
 * 2. Heading-aware chunking (same as posts)
 * 3. Chunk-based dataset entries
 *
 * Uses the $content_extractor, $chunker and $entry_builder collaborators and
 * the clean_html_for_source_content() helper, none of which are declared in
 * this class (presumably inherited from Post_Knowledge_Generator).
 *
 * @since 1.0.0
 */
class Text_Knowledge_Generator extends Post_Knowledge_Generator implements Knowledge_Generator_Interface {

	/**
	 * Generate knowledge entries from a dataset source.
	 *
	 * Resolves the text source referenced by the dataset's "text" meta value,
	 * normalizes its content, stores a cleaned copy in the "source_content"
	 * meta, chunks the normalized content and persists one entry per chunk.
	 *
	 * @param  Dataset  $dataset  Dataset to generate entries for.
	 *
	 * @return Dataset The same dataset instance that was passed in.
	 * @throws Exception If the resolved source is empty or not an array, if no content
	 *                   is found before or after normalization, if chunking yields
	 *                   nothing, or if no entry could be built from the chunks.
	 * @since 1.0.0
	 */
	public function generate( Dataset $dataset ): Dataset {
		// Get text source data
		$source = Helper::resolve_source_object( $dataset->get_source_type(), $dataset->get_meta_value( 'text' ) );
		$mapper = new Text_Knowledge_Mapper();

		if ( empty( $source ) || ! is_array( $source ) ) {
			throw new Exception(
				Error_Codes::VALIDATION_INVALID_VALUE,
				__( 'Source must be a valid text array with title and content', 'limb-chatbot' )
			);
		}

		// Step 1: Extract and normalize content (same as posts)
		$raw_content = $mapper->get_entry_output( $source );

		if ( empty( $raw_content ) ) {
			// Raised with the "warning" flag set; the mapper's entry input is attached as the link label.
			$exception = new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'No content found in ', 'limb-chatbot' ),
				[ 'warning' => true ]
			);
			$exception->attach_link( '', $mapper->get_entry_input( $source ) );
			throw $exception;
		}

		// Normalize through Content_Extractor (same as posts)
		// This ensures images, links, and other HTML elements are properly processed
		// For text sources from WYSIWYG editors, apply WordPress filters first
		$content_with_filters = apply_filters( 'the_content', $raw_content );
		$normalized_content   = $this->content_extractor->normalize( $content_with_filters );

		if ( empty( $normalized_content ) ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'No content found after normalization', 'limb-chatbot' )
			);
		}

		// Update the dataset meta so the content can be edited later.
		// Clean HTML to only keep allowed tags without attributes
		$cleaned_content = $this->clean_html_for_source_content( $normalized_content );
		$dataset->update_meta( 'source_content', $cleaned_content );

		// Step 2: Chunk content with heading awareness
		// Title precedence: dataset name, then the mapper's entry input, then a generic label.
		$title  = $dataset->get_name() ?: ( $mapper->get_entry_input( $source ) ?: __( 'Text Content', 'limb-chatbot' ) );
		$chunks = $this->chunker->chunk( $normalized_content, $title );

		if ( empty( $chunks ) ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'Failed to chunk content into segments', 'limb-chatbot' )
			);
		}

		// Step 3: Process chunks into dataset entries
		$this->process_chunks_for_text( $dataset, $chunks, $normalized_content, $title );

		return $dataset;
	}

	/**
	 * Process chunks into dataset entries for text content.
	 *
	 * Similar to process_chunks but works without WP_Post objects.
	 * Each chunk is enriched with the title, source URL, source type and a
	 * zero-based chunk index, built into an entry, and saved. A failure to
	 * build or save an individual entry is logged and skipped.
	 *
	 * @param  Dataset  $dataset  Dataset to store entries.
	 * @param  array  $chunks  Array of chunk data.
	 * @param  string  $normalized_content  Full normalized HTML content. Not read by this
	 *                                      method; kept as part of the existing signature.
	 * @param  string  $title  Document title.
	 *
	 * @throws Exception If no entry could be built from any chunk.
	 * @since 1.0.0
	 */
	protected function process_chunks_for_text(
		Dataset $dataset,
		array $chunks,
		string $normalized_content,
		string $title
	): void {
		$all_entries = new Collection();
		$source_url  = $dataset->source_url() ?? '';
		$source_type = $dataset->get_source_type();

		// Captured in method scope: inside the save closure below, __METHOD__ would
		// resolve to the closure's name instead of this method's name.
		$log_context = __METHOD__;

		// Build chunk-based entries with comprehensive metadata
		$chunk_index = 0;
		foreach ( $chunks as $chunk ) {
			try {
				// Enrich chunk with source metadata for better context
				$chunk['post_title']  = $title;
				$chunk['source_url']  = $source_url;
				$chunk['source_type'] = $source_type;
				// The index advances even when the build below fails, so it reflects
				// the chunk's position in the input rather than in the saved set.
				$chunk['chunk_index'] = $chunk_index++;

				$entry = $this->entry_builder->build( $chunk );
				$entry->set_dataset_id( $dataset->get_id() );
				$all_entries->push_item( $entry );
			} catch ( \Exception $e ) {
				// Log individual entry build errors but don't fail the whole process
				Helper::log( [
					'error'      => 'Failed to build chunk entry',
					'dataset_id' => $dataset->get_id(),
					'chunk_data' => $chunk,
					'exception'  => $e->getMessage()
				], $log_context );
			}
		}

		// Nothing could be built from any chunk: report it to the caller.
		if ( $all_entries->is_empty() ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'No valid entries were generated from given source', 'limb-chatbot' )
			);
		}

		// Save all entries to the dataset
		$all_entries->each( function ( Dataset_Entry $entry ) use ( $dataset, $log_context ) {
			try {
				$entry->save();
			} catch ( \Exception $e ) {
				// Log individual entry save errors but don't fail the whole process
				Helper::log( [
					'error'      => 'Failed to save dataset entry',
					'dataset_id' => $dataset->get_id(),
					'entry_data' => $entry->get_entry(),
					'exception'  => $e->getMessage()
				], $log_context );
			}
		} );
	}
}