<?php

namespace Limb_Chatbot\Includes\Services\Knowledge\Chunkers\Utilities;

/**
 * Token Estimator Utility
 *
 * Provides fast, language-agnostic token estimation for chunking operations.
 * Uses character and word-based estimation without external dependencies.
 *
 * @since 1.2.1
 */
class Token_Estimator {

	/**
	 * Average characters per token (conservative estimate).
	 */
	private const CHARS_PER_TOKEN = 4;

	/**
	 * Average words per token (conservative estimate).
	 *
	 * A value below 1 means every word is assumed to cost more than one token.
	 */
	private const WORDS_PER_TOKEN = 0.75;

	/**
	 * Unicode-aware pattern matching one word.
	 *
	 * A word is a run of letters, combining marks or digits, optionally joined
	 * by an inner apostrophe (ASCII or U+2019) or hyphen, so "don't" and
	 * "well-known" each count once.
	 */
	private const WORD_PATTERN = '/[\p{L}\p{M}\p{N}]+(?:[\'\x{2019}-][\p{L}\p{M}\p{N}]+)*/u';

	/**
	 * Estimate token count for a given text.
	 *
	 * Computes a character-based and a word-based estimate and returns the
	 * larger of the two so the result errs on the side of overestimating.
	 *
	 * @param string $text Text to estimate tokens for.
	 * @return int Estimated token count; 0 when the text is empty or whitespace only.
	 * @since 1.2.1
	 */
	public function estimate( string $text ): int {
		$normalized = $this->normalize_whitespace( $text );

		// Strict comparison on purpose: empty() would also treat the text "0" as empty.
		if ( '' === $normalized ) {
			return 0;
		}

		// Character-based estimation. The encoding is explicit so the count does
		// not depend on the mbstring internal-encoding setting.
		$char_tokens = (int) ceil( mb_strlen( $normalized, 'UTF-8' ) / self::CHARS_PER_TOKEN );

		// Word-based estimation.
		$word_tokens = (int) ceil( $this->count_words( $normalized ) / self::WORDS_PER_TOKEN );

		// Return the larger value so chunks sized with this estimate stay within limits.
		return max( $char_tokens, $word_tokens );
	}

	/**
	 * Estimate tokens for multiple text segments.
	 *
	 * @param array $texts Array of text strings.
	 * @return int Sum of the individual estimates; 0 for an empty array.
	 * @since 1.2.1
	 */
	public function estimate_multiple( array $texts ): int {
		$total = 0;
		foreach ( $texts as $text ) {
			$total += $this->estimate( $text );
		}
		return $total;
	}

	/**
	 * Check if text exceeds maximum token limit.
	 *
	 * @param string $text Text to check.
	 * @param int    $max_tokens Maximum allowed tokens.
	 * @return bool True if the estimate is strictly greater than $max_tokens.
	 * @since 1.2.1
	 */
	public function exceeds_limit( string $text, int $max_tokens ): bool {
		return $this->estimate( $text ) > $max_tokens;
	}

	/**
	 * Get the minimum number of characters needed for a given token count.
	 *
	 * Useful for determining if content is too small to be a viable chunk.
	 *
	 * @param int $tokens Target token count. Negative values are treated as 0.
	 * @return int Minimum characters needed (never negative).
	 * @since 1.2.1
	 */
	public function tokens_to_chars( int $tokens ): int {
		return max( 0, $tokens ) * self::CHARS_PER_TOKEN;
	}

	/**
	 * Collapse every whitespace run into a single space and trim the result.
	 *
	 * @param string $text Raw text.
	 * @return string Normalized text.
	 */
	private function normalize_whitespace( string $text ): string {
		// The "u" modifier makes \s match Unicode whitespace (e.g. no-break space).
		$normalized = preg_replace( '/\s+/u', ' ', $text );

		if ( null === $normalized ) {
			// preg_replace() returns null on error, e.g. invalid UTF-8 under "u".
			// Retry byte-wise so malformed input is still estimated.
			$normalized = preg_replace( '/\s+/', ' ', $text );
		}

		if ( null === $normalized ) {
			$normalized = $text;
		}

		return trim( $normalized );
	}

	/**
	 * Count words in already-normalized text.
	 *
	 * Uses a Unicode-aware pattern because str_word_count() only recognises
	 * ASCII letters and therefore reports no words for e.g. Cyrillic text.
	 *
	 * @param string $text Normalized text.
	 * @return int Number of words found.
	 */
	private function count_words( string $text ): int {
		$count = preg_match_all( self::WORD_PATTERN, $text );

		if ( false === $count ) {
			// Pattern failed (typically invalid UTF-8): fall back to the byte-based counter.
			return str_word_count( $text );
		}

		return $count;
	}
}

