<?php

namespace Limb_Chatbot\Includes\Services\Knowledge\Builders;

use Limb_Chatbot\Includes\Data_Objects\Dataset_Entry;

/**
 * Chunk Entry Builder for creating dataset entries from chunks.
 *
 * Builds dataset entries with the same structure as QA entries,
 * but using chunk content instead of question-answer pairs.
 *
 * @since 1.0.0
 */
class Chunk_Entry_Builder {

	/**
	 * Build a dataset entry from a chunk.
	 *
	 * The chunk title (falling back to the heading when no title is set) becomes the
	 * message input and the chunk content becomes the message output. Descriptive
	 * fields are collected under the 'entry_metadata' key; most of them are read from
	 * the chunk itself first and from the chunk's 'metadata' array second.
	 *
	 * A 'path_array' or 'metadata' value that is not array-like is treated as empty
	 * instead of raising an error.
	 *
	 * @param  array  $chunk  Chunk data with structure:
	 *                     [
	 *                       'title' => string,
	 *                       'heading' => string,
	 *                       'heading_path' => string,
	 *                       'content' => string,
	 *                       'tokens' => int (optional),
	 *                       'chunk_index' => int (optional),
	 *                       'overlap_start' => string|null (optional),
	 *                       'overlap_end' => string|null (optional),
	 *                       'path_array' => array (optional),
	 *                       'metadata' => array (optional),
	 *                       'post_title' => string (optional),
	 *                       'source_url' => string (optional),
	 *                       'source_type' => string (optional),
	 *                     ]
	 *
	 * @return Dataset_Entry Dataset entry object.
	 * @since 1.0.0
	 */
	public function build( array $chunk ): Dataset_Entry {
		// Build the entry structure matching the existing format:
		// for chunks, the title is used as input and the content as output.
		$entry_data = [
			'system'   => '',
			'messages' => [
				[
					'input'  => $chunk['title'] ?? $chunk['heading'] ?? '',
					'output' => $chunk['content'] ?? '',
				]
			],
		];

		// end() and array_slice() only accept arrays, so anything else is treated as "no path".
		$path_array = $chunk['path_array'] ?? [];
		if ( ! is_array( $path_array ) ) {
			$path_array = [];
		}

		// Metadata is read with offset syntax below; a plain object would be a fatal error there.
		$chunk_metadata = $chunk['metadata'] ?? [];
		if ( ! is_array( $chunk_metadata ) && ! ( $chunk_metadata instanceof \ArrayAccess ) ) {
			$chunk_metadata = [];
		}

		// Last path element is the fallback source for the heading level.
		// end() returns false for an empty array; the ?? chain below turns that into null.
		$last_path_item = end( $path_array );

		// Chunk-level values take precedence; the metadata array is the fallback.
		$entry_data['entry_metadata'] = [
			'entry_type'      => 'chunk',
			'heading'         => $chunk['heading'] ?? $chunk['title'] ?? '',
			'heading_path'    => $chunk['heading_path'] ?? $chunk_metadata['heading_path'] ?? '',
			'post_title'      => $chunk['post_title'] ?? $chunk_metadata['post_title'] ?? '',
			'source_url'      => $chunk['source_url'] ?? $chunk_metadata['source_url'] ?? '',
			'source_type'     => $chunk['source_type'] ?? $chunk_metadata['source_type'] ?? '',
			'chunk_index'     => $chunk['chunk_index'] ?? $chunk_metadata['chunk_index'] ?? null,
			'tokens'          => $chunk['tokens'] ?? $chunk_metadata['tokens'] ?? null,
			'overlap_start'   => $chunk['overlap_start'] ?? null,
			'overlap_end'     => $chunk['overlap_end'] ?? null,
			'heading_level'   => $chunk_metadata['heading_level'] ?? $last_path_item['level'] ?? null,
			// Without explicit parent headings, use the 'text' of every path element except the last.
			'parent_headings' => $chunk_metadata['parent_headings'] ?? array_column( array_slice( $path_array, 0, - 1 ), 'text' ),
			'semantic_type'   => $chunk_metadata['semantic_type'] ?? null,
		];

		$dataset_entry = Dataset_Entry::make();
		$dataset_entry->set_entry( $entry_data );

		return $dataset_entry;
	}
}
