<?php

namespace Limb_Chatbot\Includes\AI_Providers\Grok\Services;

use Limb_Chatbot\Includes\Data_Objects\Chatbot;
use Limb_Chatbot\Includes\Data_Objects\Message;
use Limb_Chatbot\Includes\Exceptions\Error_Codes;
use Limb_Chatbot\Includes\Exceptions\Exception;
use Limb_Chatbot\Includes\Interfaces\Token_Calculator_Interface;

/**
 * Class Token_Calculator
 *
 * Estimates the number of tokens in a given message for xAI.
 * xAI uses a tokenization similar to GPT models, so we use similar estimation logic.
 *
 * @package Limb_Chatbot\Includes\AI_Providers\Grok\Services
 * @since 1.0.12
 */
class Token_Calculator implements Token_Calculator_Interface {

	/**
	 * Pattern matching single code points treated as "special" characters.
	 *
	 * Covers the ranges U+1F300–U+1FAFF and U+2600–U+27BF (emoji, pictographs,
	 * miscellaneous symbols and dingbats). Each match adds extra tokens on top
	 * of the word-based estimate.
	 *
	 * @var string
	 * @since 1.0.12
	 */
	const SPECIAL_CHARACTERS_PATTERN = '/[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}]/u';

	/**
	 * Maximum length (in characters) of a word that is counted as 1 token.
	 *
	 * @var int
	 * @since 1.0.12
	 */
	const SMALL_WORD_THRESHOLD = 4;

	/**
	 * Maximum length (in characters) of a word that is counted as 2 tokens.
	 *
	 * @var int
	 * @since 1.0.12
	 */
	const MEDIUM_WORD_THRESHOLD = 8;

	/**
	 * Average characters per token, used for words longer than the medium threshold.
	 *
	 * @var int
	 * @since 1.0.12
	 */
	const CHAR_PER_TOKEN = 4;

	/**
	 * Number of tokens added for every special character found in the text.
	 *
	 * @var int
	 * @since 1.0.12
	 */
	const ADDITIONAL_TOKEN_FOR_SPECIAL_CHARS = 1;

	/**
	 * The message instance for which token calculation is performed.
	 *
	 * @var Message
	 * @since 1.0.12
	 */
	protected Message $message;

	/**
	 * Optional chatbot context.
	 *
	 * Stored by the constructor; no method in this class reads it.
	 *
	 * @var Chatbot|null
	 * @since 1.0.12
	 */
	protected ?Chatbot $chatbot = null;

	/**
	 * Token_Calculator constructor.
	 *
	 * @param Message      $message Message object containing content to be tokenized.
	 * @param Chatbot|null $chatbot Optional chatbot context.
	 *
	 * @since 1.0.12
	 */
	public function __construct( Message $message, ?Chatbot $chatbot = null ) {
		$this->message = $message;
		$this->chatbot = $chatbot;
	}

	/**
	 * Calculates the estimated total token usage for the given message.
	 *
	 * Each content part is dispatched to a `count_{type}_tokens()` method when
	 * one exists on this object. Parts with an unsupported type, a missing
	 * type, or a missing value contribute nothing to the total.
	 *
	 * @return int Total token count.
	 *
	 * @throws Exception Declared for counters that may throw; nothing in this class throws it directly.
	 * @since 1.0.12
	 */
	public function calculate(): int {
		$tokens = 0;

		foreach ( $this->message->get_content() as $content ) {
			// A part without a usable type cannot be dispatched; skip it
			// instead of emitting an "undefined array key" warning.
			$type = $content['type'] ?? null;
			if ( ! is_string( $type ) || '' === $type ) {
				continue;
			}

			$method = "count_{$type}_tokens";
			if ( ! method_exists( $this, $method ) ) {
				continue;
			}

			// A part without a value would otherwise pass null into a typed
			// counter and raise a TypeError; treat it as zero tokens.
			$value = $content[ $type ]['value'] ?? null;
			if ( null === $value ) {
				continue;
			}

			$tokens += $this->$method( $value );
		}

		return $tokens;
	}

	/**
	 * Count tokens for text content.
	 *
	 * Whitespace-separated words are weighted by length: up to
	 * SMALL_WORD_THRESHOLD characters count as 1 token, up to
	 * MEDIUM_WORD_THRESHOLD as 2 tokens, and longer words as
	 * ceil( length / CHAR_PER_TOKEN ). Every character matching
	 * SPECIAL_CHARACTERS_PATTERN then adds ADDITIONAL_TOKEN_FOR_SPECIAL_CHARS
	 * on top of that.
	 *
	 * @param string $text The text to count tokens for.
	 *
	 * @return int Estimated token count.
	 * @since 1.0.12
	 */
	protected function count_text_tokens( string $text ): int {
		// Compare against '' explicitly: empty() also treats the string "0"
		// as empty, which would make the one-word message "0" cost 0 tokens.
		if ( '' === $text ) {
			return 0;
		}

		$tokens = 0;

		// Split text into words; whitespace-only input yields no words.
		$words = preg_split( '/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY );
		if ( false === $words ) {
			// preg_split() reports failure with false; fall back to no words.
			$words = array();
		}

		foreach ( $words as $word ) {
			$len = mb_strlen( $word );

			if ( $len <= self::SMALL_WORD_THRESHOLD ) {
				$tokens += 1;
			} elseif ( $len <= self::MEDIUM_WORD_THRESHOLD ) {
				$tokens += 2;
			} else {
				// Longer words: estimate based on character count.
				$tokens += (int) ceil( $len / self::CHAR_PER_TOKEN );
			}
		}

		// Count special characters (emojis, etc.). preg_match_all() returns the
		// number of matches, or false on failure, which the cast turns into 0.
		$special_count = (int) preg_match_all( self::SPECIAL_CHARACTERS_PATTERN, $text );
		$tokens       += $special_count * self::ADDITIONAL_TOKEN_FOR_SPECIAL_CHARS;

		return $tokens;
	}
}
