<?php

namespace Limb_Chatbot\Includes\Services\Knowledge\Dataset_Builders;

use Limb_Chatbot\Includes\Data_Objects\AI_Model;
use Limb_Chatbot\Includes\Data_Objects\Chatbot;
use Limb_Chatbot\Includes\Data_Objects\Dataset;
use Limb_Chatbot\Includes\Data_Objects\Dataset_Meta;
use Limb_Chatbot\Includes\Data_Objects\Job;
use Limb_Chatbot\Includes\Data_Objects\Vector_Index;
use Limb_Chatbot\Includes\Exceptions\Error_Codes;
use Limb_Chatbot\Includes\Exceptions\Exception;
use Limb_Chatbot\Includes\Factories\Knowledge_Mapper_Factory;
use Limb_Chatbot\Includes\Interfaces\Dataset_Builder_Interface;
use Limb_Chatbot\Includes\Services\Helper;

/**
 * Builds (or re-uses) the informational-knowledge dataset for a single knowledge source
 * and stamps it with the indexing settings taken from the job configuration.
 */
class Informational_Dataset_Builder implements Dataset_Builder_Interface {

	/**
	 * Initialize or prepare a dataset for knowledge gathering.
	 *
	 * - Reads the indexing settings (AI model, config, vector index type/config, dimension) from the job.
	 * - Resolves the chatbot and fetches or creates its vector index for those settings.
	 * - Re-uses an existing dataset for the same source when one matches, otherwise creates one.
	 * - Marks the dataset as not synced and stores the indexing settings as dataset meta.
	 *
	 * @param  array  $data  Dataset attributes. `source_type` is required; `source` and
	 *                       `source_sub_type` are read when present. The whole array is
	 *                       passed to the Dataset constructor when a new dataset is created.
	 * @param  Job  $job  Job that supplies the indexing configuration and the chatbot UUID.
	 *
	 * @return Dataset|null The prepared dataset, or null when an exception was thrown (it is logged).
	 */
	public function build( array $data, Job $job ): ?Dataset {
		try {
			$model_id               = $job->get_config_value( 'indexing_ai_model_id' );
			$config_id              = $job->get_config_value( 'indexing_config_id' );
			$vector_index_type      = $job->get_config_value( 'indexing_vector_index_type' );
			$dimension              = $job->get_config_value( 'indexing_dimension' );
			$vector_index_config_id = $job->get_config_value( 'indexing_vector_index_config_id' );

			// The default chatbot is represented by a null UUID.
			$job_chatbot_uuid = $job->get_chatbot_uuid();
			$chatbot_uuid     = $job_chatbot_uuid === Job::CHATBOT_DEFAULT ? null : $job_chatbot_uuid;

			$mapper       = ( new Knowledge_Mapper_Factory() )->make( $data['source_type'] );
			$source       = Helper::resolve_source_object( $data['source_type'], $this->get_source( $job, $data ) );
			$dataset_name = $mapper->get_entry_input( $source );

			$chatbot = ! empty( $chatbot_uuid ) ? Chatbot::find_by_uuid( $chatbot_uuid ) : Chatbot::make();
			if ( empty( $chatbot ) ) {
				// Calling a method on a missing chatbot would raise an \Error, which the
				// catch block below does not handle. Fail through the regular exception path instead.
				throw new Exception( Error_Codes::TECHNICAL_ERROR, __( 'Chatbot not found.', 'limb-chatbot' ) );
			}

			$vector_index    = $this->resolve_vector_index( $chatbot, $vector_index_config_id, $vector_index_type, $dimension );
			$vector_index_id = $vector_index->get_id();

			// Ensure dataset exists or create new one
			$dataset = $this->check_dataset_existence( $data, $vector_index_id, $model_id );
			if ( empty( $dataset ) ) {
				$dataset = new Dataset( $data );
				$dataset->set_status( Dataset::STATUS_PENDING );
				$dataset->set_name( $dataset_name );
				$dataset->set_type( Dataset::TYPE_INFORMATIONAL_KNOWLEDGE );
				$dataset->save();
			}

			// Reset sync status and metadata
			$dataset->mark_not_synced();
			$meta = [
				'index_ai_model_id'      => $model_id,
				'index_config_id'        => $config_id,
				'vector_index_id'        => $vector_index_id,
				'vector_index_config_id' => $vector_index_config_id,
				'vector_index_type'      => $vector_index_type,
				'chatbot_uuid'           => $chatbot_uuid,
				'dimension'              => $dimension,
			];
			foreach ( $meta as $meta_key => $meta_value ) {
				$dataset->update_meta( $meta_key, $meta_value );
			}

			// Keep a copy of the raw source next to the dataset (switch compares loosely, like ==).
			switch ( $dataset->get_source_type() ) {
				case Dataset::SOURCE_QA:
					$dataset->update_meta( 'q_a', $job->get_config_value( 'q_a' ) );
					break;
				case Dataset::SOURCE_TEXT:
					$dataset->update_meta( 'text', $job->get_config_value( 'text' ) );
					break;
				case Dataset::SOURCE_URL:
					// The URL already lives in the source field; it is mirrored in meta for reference.
					$dataset->update_meta( 'url', $dataset->get_source() );
					break;
			}

			return $dataset;
		} catch ( \Exception $e ) {
			Helper::log( $e );

			return null;
		}
	}

	/**
	 * Fetch the chatbot's vector index for the given settings, creating it when none is found.
	 *
	 * @param  Chatbot  $chatbot  Chatbot that owns the vector index.
	 * @param  mixed  $vector_index_config_id  Value of the job's 'indexing_vector_index_config_id' setting.
	 * @param  mixed  $vector_index_type  Value of the job's 'indexing_vector_index_type' setting.
	 * @param  mixed  $dimension  Value of the job's 'indexing_dimension' setting.
	 *
	 * @return Vector_Index
	 * @throws Exception When no index exists and creation does not return a Vector_Index.
	 */
	private function resolve_vector_index( $chatbot, $vector_index_config_id, $vector_index_type, $dimension ) {
		$vector_index = $chatbot->get_vector_index_for_chatbot( $vector_index_config_id, $vector_index_type, $dimension );
		if ( $vector_index ) {
			return $vector_index;
		}

		$vector_index = $chatbot->create_vector_index_for_chatbot( $vector_index_config_id, $vector_index_type, $dimension );
		if ( ! $vector_index instanceof Vector_Index ) {
			$string = __( 'Technical error happened during storage creation.', 'limb-chatbot' );
			throw new Exception( Error_Codes::TECHNICAL_ERROR, $string );
		}

		return $vector_index;
	}

	/**
	 * Look for an existing informational dataset that can be re-used for this source.
	 *
	 * Q&A, text and URL sources are never matched against existing datasets. For any other
	 * source type a dataset is re-used only when it has the same source type, sub type and
	 * source, its 'vector_index_id' meta equals $vector_index_id, and its AI provider equals
	 * the provider of the given AI model.
	 *
	 * @param  array  $data  Dataset attributes ('source_type', optional 'source_sub_type' and 'source').
	 * @param  mixed  $vector_index_id  ID of the vector index the dataset must belong to.
	 * @param  mixed  $ai_model_id  ID of the AI model selected for indexing.
	 *
	 * @return Dataset|null The first matching dataset, or null when there is none.
	 * @throws Exception When a candidate needs the AI model for comparison but it cannot be found.
	 */
	private function check_dataset_existence( $data, $vector_index_id, $ai_model_id ) {
		$never_reused = [ Dataset::SOURCE_QA, Dataset::SOURCE_TEXT, Dataset::SOURCE_URL ];
		if ( in_array( $data['source_type'], $never_reused, true ) ) {
			return null;
		}

		$datasets = Dataset::where( [
			'source_type'     => $data['source_type'],
			'source_sub_type' => $data['source_sub_type'] ?? null,
			'source'          => $data['source'] ?? null,
			'type'            => Dataset::TYPE_INFORMATIONAL_KNOWLEDGE,
		] );
		if ( $datasets->is_empty() ) {
			return null;
		}

		// Loaded lazily: the model is only needed once a dataset on the same vector index shows up.
		$ai_model = null;
		foreach ( $datasets->get() as $item ) {
			if ( ! $item instanceof Dataset ) {
				continue;
			}
			// Loose comparison on purpose: the meta value and the index ID may differ in type.
			if ( $item->get_meta_value( 'vector_index_id' ) != $vector_index_id ) {
				continue;
			}
			if ( null === $ai_model ) {
				$ai_model = AI_Model::find( $ai_model_id );
				if ( empty( $ai_model ) ) {
					throw new Exception( Error_Codes::TECHNICAL_ERROR, __( 'AI model not found.', 'limb-chatbot' ) );
				}
			}
			if ( $item->ai_provider_id() == $ai_model->get_ai_provider_id() ) {
				return $item;
			}
		}

		return null;
	}

	/**
	 * Pick the raw source value for the dataset.
	 *
	 * Q&A and text sources are read from the job config ('q_a' / 'text'); every other
	 * source type, URL included, takes the value from $data['source'].
	 *
	 * @param  Job  $job  Job holding the 'q_a' / 'text' config values.
	 * @param  array  $data  Dataset attributes ('source_type', optional 'source').
	 *
	 * @return mixed The raw source, or null when $data has no 'source' entry.
	 */
	private function get_source( Job $job, array $data ) {
		if ( $data['source_type'] === Dataset::SOURCE_QA ) {
			return $job->get_config_value( 'q_a' );
		}
		if ( $data['source_type'] === Dataset::SOURCE_TEXT ) {
			return $job->get_config_value( 'text' );
		}

		return $data['source'] ?? null;
	}
}