<?php

namespace Limb_Chatbot\Includes\Services\Knowledge\Chunkers;

use Limb_Chatbot\Includes\Services\Knowledge\Chunkers\Utilities\Token_Estimator;

/**
 * Enhanced Heading-Aware Semantic Chunker (Enterprise RAG-Optimized)
 *
 * Enhanced Features:
 * - Token-based chunking with configurable min/max
 * - Smart chunk distribution avoiding tiny/large chunks
 * - Configurable overlap (default 15%) with optional baking into content
 * - Intelligent semantic boundary splitting with added support for code blocks
 * - Rich metadata including semantic types and hierarchy
 * - Handles h1-h6 headings and plain text fallback
 * - Improved format preservation: ordered lists, basic tables to markdown
 * - Input type detection for HTML vs plain text
 * - Performance optimizations: reduced regex, caching where possible
 *
 * @since 1.3.0
 */
class Heading_Aware_Chunker {

	/**
	 * Maximum chunk size in tokens.
	 *
	 * Sections estimated above this size are handed to split_large_section().
	 */
	private int $max_tokens = 800;

	/**
	 * Minimum viable chunk size in tokens.
	 *
	 * Sections below this size are buffered and merged with their neighbours.
	 * The constructor replaces this default with 25% of $max_tokens whenever
	 * no explicit minimum is passed.
	 */
	private int $min_tokens = 200;

	/**
	 * Overlap between neighbouring chunks, as a fraction (0.15 = 15%).
	 *
	 * NOTE(review): consumed outside the methods visible here (presumably by
	 * add_overlap_to_chunks()) — confirm.
	 */
	private float $overlap_percentage = 0.15;

	/** Whether to bake overlap into chunk content (constructor default: false). */
	private bool $bake_overlap = false;

	/** Token estimator used to measure section and chunk sizes. */
	private Token_Estimator $token_estimator;

	/**
	 * Constructor.
	 *
	 * @param  int|null  $max_tokens  Maximum tokens per chunk (default: 800).
	 * @param  int|null  $min_tokens  Minimum tokens per chunk (default: 25% of the effective maximum, i.e. 200 when the maximum is 800).
	 * @param  float|null  $overlap_percentage  Overlap percentage (default: 0.15).
	 * @param  bool  $bake_overlap  Whether to bake overlap into content (default: false, uses metadata).
	 *
	 * @since 1.3.0
	 */
	public function __construct(
		?int $max_tokens = null,
		?int $min_tokens = null,
		?float $overlap_percentage = null,
		bool $bake_overlap = false
	) {
		$this->token_estimator = new Token_Estimator();

		if ( $max_tokens !== null ) {
			// A non-positive maximum would make split_large_section() divide by zero,
			// so the smallest accepted value is 1.
			$this->max_tokens = max( 1, $max_tokens );
		}

		// Without an explicit minimum, derive it from the (possibly customised) maximum.
		$resolved_min = $min_tokens ?? (int) ( $this->max_tokens * 0.25 );

		// Keep the minimum inside [0, max]: a minimum above the maximum can never be
		// satisfied by a chunk that also respects the maximum.
		$this->min_tokens = max( 0, min( $resolved_min, $this->max_tokens ) );

		if ( $overlap_percentage !== null ) {
			// Overlap is a fraction of a chunk; values outside 0..1 have no meaning.
			$this->overlap_percentage = max( 0.0, min( 1.0, $overlap_percentage ) );
		}

		$this->bake_overlap = $bake_overlap;
	}

	/**
	 * Chunk content semantically with token-based sizing.
	 *
	 * Automatically detects if input is HTML or plain text.
	 *
	 * @param  string  $content  Content to chunk (HTML or plain text).
	 * @param  string  $title  Document title.
	 *
	 * @return array Array of chunked content with rich metadata.
	 * @since 1.3.0
	 */
	public function chunk( string $content, string $title ): array {
		// Nothing to chunk. Note that PHP's empty() also treats the string "0" as empty.
		if ( empty( $content ) ) {
			return [];
		}

		// Any tag-like "<...>" sequence routes the document through the HTML parser;
		// everything else is treated as plain text.
		$looks_like_html = (bool) preg_match( '/<[^>]+>/', $content );

		$sections = $looks_like_html
			? $this->build_sections_from_html( $content, $title )
			: $this->build_sections_from_plain_text( $content, $title );

		// No usable sections means no chunks; otherwise merge/split the sections
		// into token-sized chunks and attach overlap.
		return empty( $sections )
			? []
			: $this->process_sections_with_smart_distribution( $sections, $title );
	}

	/* =============================================================
	 * SECTION BUILDING (Enhanced HTML Parsing)
	 * ============================================================= */

	/**
	 * Build sections from HTML with enhanced parsing (h1-h6).
	 *
	 * @param  string  $html  HTML content.
	 * @param  string  $title  Document title.
	 *
	 * @return array Array of sections with metadata.
	 * @since 1.3.0
	 */
	private function build_sections_from_html( string $html, string $title ): array {
		$sections = [];

		// Split on complete h1-h6 elements, keeping the headings themselves as parts.
		$parts = preg_split(
			'/(<h[1-6][^>]*>.*?<\/h[1-6]>)/is',
			$html,
			- 1,
			PREG_SPLIT_DELIM_CAPTURE
		);

		// preg_split() returns false on a PCRE failure (e.g. backtrack limit on a huge
		// document). Fall back to a single heading-less part instead of iterating false.
		$split_succeeded = is_array( $parts );
		if ( ! $split_succeeded ) {
			$parts = [ $html ];
		}

		// Content that precedes the first heading is filed under the document title.
		$current_heading = $title;
		$current_path    = [ [ 'level' => 1, 'text' => $title ] ];
		$current_content = '';

		foreach ( $parts as $part ) {
			$part = trim( $part );
			if ( $part === '' ) {
				continue;
			}

			$is_heading = $split_succeeded
			              && preg_match( '/<h([1-6])[^>]*>(.*?)<\/h[1-6]>/is', $part, $m );

			if ( ! $is_heading ) {
				$current_content .= ' ' . $part;
				continue;
			}

			$level = (int) $m[1];
			// Same decode flags as the rest of this class (see extract_clean_text()).
			$text = trim( html_entity_decode( wp_strip_all_tags( $m[2] ), ENT_QUOTES | ENT_HTML5, 'UTF-8' ) );

			if ( $text === '' ) {
				// A heading without text (e.g. one that only wraps an image) does not start
				// a new section. Flushing here without resetting the accumulator would emit
				// the same content twice, so keep its markup as ordinary content instead.
				$current_content .= ' ' . $m[2];
				continue;
			}

			// A real heading closes the running section and opens the next one.
			$this->flush_section( $sections, $current_heading, $current_path, $current_content );

			$current_path    = $this->update_heading_path( $current_path, $text, $level );
			$current_heading = $text;
			$current_content = '';
		}

		// Emit whatever followed the last heading.
		$this->flush_section( $sections, $current_heading, $current_path, $current_content );

		return $sections;
	}

	/**
	 * Flush a section to the sections array.
	 *
	 * @param  array  $sections  Sections array (by reference).
	 * @param  string  $heading  Section heading.
	 * @param  array  $path  Heading path hierarchy.
	 * @param  string  $html  HTML content.
	 *
	 * @return void
	 * @since 1.3.0
	 */
	private function flush_section( array &$sections, string $heading, array $path, string $html ): void {
		$content = $this->extract_clean_text( $html );

		// Compare against '' explicitly: empty() is also true for the string "0",
		// which would silently drop a section whose whole body is "0".
		if ( trim( $content ) === '' ) {
			return;
		}

		// The markdown heading level comes from the deepest entry of the heading path;
		// level 2 is the default when the path is empty or the entry has no level.
		$heading_level = 2;
		if ( ! empty( $path ) ) {
			$last_path_item = end( $path );
			$heading_level  = $last_path_item['level'] ?? 2;
		}

		// Prepend the heading as markdown so every section carries its own title.
		$hashes           = str_repeat( '#', $heading_level );
		$heading_markdown = "\n\n{$hashes} {$heading}\n\n";
		$content          = $heading_markdown . $content;

		// The token count includes the prepended heading.
		$tokens = $this->token_estimator->estimate( $content );

		$sections[] = [
			'heading'      => $heading,
			'heading_path' => $this->build_heading_path( $path ),
			'path_array'   => $path,
			'content'      => $content,
			'tokens'       => $tokens,
			'raw_html'     => $html, // Keep for semantic boundary detection
		];
	}

	/**
	 * Extract clean text from HTML/plain while preserving markdown structure.
	 *
	 * Enhanced: Better list handling (preserves ordered lists), basic table to markdown.
	 *
	 * @param  string  $content  HTML or plain content.
	 *
	 * @return string Clean text with markdown formatting.
	 * @since 1.3.0
	 */
	private function extract_clean_text( string $content ): string {
		// Plain text only needs whitespace normalisation.
		$is_html = preg_match( '/<[^>]+>/', $content );
		if ( ! $is_html ) {
			$content = preg_replace( '/[ \t]+/', ' ', $content );
			$content = preg_replace( '/\n{3,}/', "\n\n", $content );

			return trim( $content );
		}

		// NOTE on the tag patterns below: a tag name is always followed by
		// "(?:\s[^>]*)?>" (or "\b") so that a short name cannot match the start of a
		// longer one, e.g. <b> vs <br>/<blockquote>, <a> vs <article>, <i> vs <iframe>.

		// Remove scripts and styles
		$content = preg_replace( '/<script[^>]*>.*?<\/script>/is', '', $content );
		$content = preg_replace( '/<style[^>]*>.*?<\/style>/is', '', $content );

		// Convert headings to markdown
		$content = preg_replace_callback( '/<h([1-6])[^>]*>(.*?)<\/h[1-6]>/is', static function ( $matches ) {
			$level  = (int) $matches[1];
			$text   = trim( wp_strip_all_tags( $matches[2] ) );
			$hashes = str_repeat( '#', $level );

			return "\n\n{$hashes} {$text}\n\n";
		}, $content );

		// Convert tables to markdown (basic)
		$content = preg_replace_callback(
			'/<table[^>]*>(.*?)<\/table>/is',
			static function ( $matches ) {
				$html = $matches[1];
				// Extract rows
				preg_match_all( '/<tr[^>]*>(.*?)<\/tr>/is', $html, $row_matches );
				if ( empty( $row_matches[1] ) ) {
					return $matches[0];
				}
				$rows       = [];
				$has_header = false;
				$max_cols   = 0;
				foreach ( $row_matches[1] as $row_html ) {
					preg_match_all( '/<(td|th)[^>]*>(.*?)<\/\1>/is', $row_html, $cell_matches );
					if ( empty( $cell_matches[2] ) ) {
						continue;
					}
					$cells = array_map(
						static function ( $cell ) {
							$cell = preg_replace( '/<br\s*\/?>/i', '<br>', $cell );

							return trim(
								wp_strip_all_tags(
									html_entity_decode( $cell, ENT_QUOTES | ENT_HTML5, 'UTF-8' ),
									'<br>'
								)
							);
						},
						$cell_matches[2]
					);
					$rows[]   = $cells;
					$max_cols = max( $max_cols, count( $cells ) );
					if ( in_array( 'th', $cell_matches[1], true ) ) {
						$has_header = true;
					}
				}
				if ( empty( $rows ) ) {
					return $matches[0];
				}
				// Pad short rows so every row has the same number of columns.
				foreach ( $rows as $i => $row ) {
					$rows[ $i ] = array_pad( $row, $max_cols, '' );
				}
				// Build Markdown; the separator row follows the first row when any <th> was seen.
				$markdown = '';
				foreach ( $rows as $index => $row ) {
					$markdown .= '| ' . implode( ' | ', $row ) . " |\n";
					if ( $index === 0 && $has_header ) {
						$markdown .= '| ' . implode(
								' | ',
								array_fill( 0, $max_cols, '---' )
							) . " |\n";
					}
				}

				return "\n" . rtrim( $markdown ) . "\n";
			},
			$content
		);

		// Convert links to markdown: [text](url)
		$content = preg_replace_callback( '/<a((?:\s[^>]*)?)>(.*?)<\/a>/is', static function ( $matches ) {
			$clean_text = trim( wp_strip_all_tags( $matches[2] ) );

			// The look-behind keeps "href" from matching inside e.g. "data-href"; the
			// back-reference lets the value contain the other kind of quote.
			$href = '';
			if ( preg_match( '/(?<![\w-])href\s*=\s*(["\'])(.*?)\1/is', $matches[1], $href_match ) ) {
				$href = $href_match[2];
			}

			// Anchor without a destination: keep only its text.
			if ( $href === '' ) {
				return $clean_text;
			}

			// Link without visible text: show the URL itself.
			if ( $clean_text === '' ) {
				$clean_text = $href;
			}

			return "[{$clean_text}]({$href})";
		}, $content );

		// Images are handled in three passes: <picture>, <figure>, then any remaining <img>.

		// 1. Picture tags (responsive images with source elements): keep the fallback <img>.
		$content = preg_replace_callback( '/<picture[^>]*>(.*?)<\/picture>/is', function ( $matches ) {
			if ( preg_match( '/<img\b([^>]*)>/i', $matches[1], $img_match ) ) {
				return "\n\n" . $this->convert_image_to_markdown( $img_match[1], '' ) . "\n\n";
			}

			return '';
		}, $content );

		// 2. Figure tags with optional captions.
		$content = preg_replace_callback( '/<figure[^>]*>(.*?)<\/figure>/is', function ( $matches ) {
			$figure_content = $matches[1];

			$caption_text = '';
			if ( preg_match( '/<figcaption[^>]*>(.*?)<\/figcaption>/is', $figure_content, $caption_match ) ) {
				$caption_text = trim( wp_strip_all_tags( $caption_match[1] ) );
			}

			if ( ! preg_match( '/<img\b([^>]*)>/i', $figure_content, $img_match ) ) {
				// Figures also wrap tables, quotes, embeds and already-converted <picture>
				// images. Keep that content (minus the raw figcaption element) instead of
				// replacing the whole figure with just its caption.
				$inner   = preg_replace( '/<figcaption[^>]*>.*?<\/figcaption>/is', '', $figure_content ) ?? $figure_content;
				$caption = $caption_text !== '' ? "\n*{$caption_text}*" : '';

				return "\n\n{$inner}{$caption}\n\n";
			}

			// The caption doubles as alt text when the image has neither alt nor title.
			$image_markdown = $this->convert_image_to_markdown( $img_match[1], $caption_text );

			// Print the caption below the image unless it already appears in the markdown.
			$caption = '';
			if ( $caption_text !== '' && strpos( $image_markdown, $caption_text ) === false ) {
				$caption = "\n*{$caption_text}*";
			}

			return "\n\n{$image_markdown}{$caption}\n\n";
		}, $content );

		// 3. All remaining images, wherever they sit in the markup.
		$content = preg_replace_callback( '/<img\b([^>]*)>/i', function ( $matches ) {
			return $this->convert_image_to_markdown( $matches[1], '' );
		}, $content );

		// Convert bold/strong to markdown (closing tag must match the opening tag).
		$content = preg_replace( '/<(strong|b)(?:\s[^>]*)?>(.*?)<\/\1>/is', '**$2**', $content );

		// Convert italic/em to markdown
		$content = preg_replace( '/<(em|i)(?:\s[^>]*)?>(.*?)<\/\1>/is', '*$2*', $content );

		// Convert code to markdown. <pre> goes first and drops any nested <code> tags, so
		// <pre><code>…</code></pre> becomes one fenced block without inline backticks inside.
		$content = preg_replace_callback( '/<pre(?:\s[^>]*)?>(.*?)<\/pre>/is', static function ( $matches ) {
			$code = preg_replace( '/<\/?code(?:\s[^>]*)?>/i', '', $matches[1] ) ?? $matches[1];

			return "\n```\n{$code}\n```\n";
		}, $content );
		$content = preg_replace( '/<code(?:\s[^>]*)?>(.*?)<\/code>/is', '`$1`', $content );

		// Convert blockquotes to markdown
		$content = preg_replace_callback( '/<blockquote[^>]*>(.*?)<\/blockquote>/is', static function ( $matches ) {
			$inner = trim( wp_strip_all_tags( $matches[1] ) );

			return "\n> {$inner}\n";
		}, $content );

		// Unordered list wrappers become blank lines; their items are converted below.
		$content = preg_replace( '/<ul[^>]*>/i', "\n", $content );
		$content = preg_replace( '/<\/ul>/i', "\n", $content );

		// Convert ordered lists to markdown with numbers
		$content = preg_replace_callback( '/<ol[^>]*>(.*?)<\/ol>/is', static function ( $matches ) {
			$items   = preg_split( '/<li(?:\s[^>]*)?>/i', $matches[1], - 1, PREG_SPLIT_NO_EMPTY ) ?: [];
			$md_list = '';
			// Number only the items that are actually emitted: whitespace before the first
			// <li> yields a blank split part that must not consume number 1.
			$number = 0;
			foreach ( $items as $item ) {
				$item = trim( wp_strip_all_tags( $item ) );
				if ( $item !== '' ) {
					$number ++;
					$md_list .= "{$number}. {$item}\n";
				}
			}

			return "\n" . $md_list . "\n";
		}, $content );

		// Convert the remaining list items (those of unordered lists) to bullets.
		$content = preg_replace_callback( '/<li(?:\s[^>]*)?>(.*?)<\/li>/is', static function ( $matches ) {
			$inner = trim( wp_strip_all_tags( $matches[1] ) );

			return "- {$inner}\n";
		}, $content );

		// Convert paragraphs
		$content = preg_replace( '/<p(?:\s[^>]*)?>/i', "\n", $content );
		$content = preg_replace( '/<\/p>/i', "\n", $content );

		// Convert line breaks
		$content = preg_replace( '/<br\s*\/?>/i', "\n", $content );

		// Strip remaining HTML tags (everything worth keeping is markdown by now).
		$content = wp_strip_all_tags( $content );

		// Decode HTML entities
		$content = html_entity_decode( $content, ENT_QUOTES | ENT_HTML5, 'UTF-8' );

		// Normalize whitespace
		$content = preg_replace( '/[ \t]+/', ' ', $content );
		$content = preg_replace( '/\n{3,}/', "\n\n", $content );
		$content = preg_replace( '/[ \t]+\n/', "\n", $content );

		return trim( $content );
	}

	/**
	 * Convert image attributes to markdown format.
	 *
	 * Handles all image scenarios:
	 * - Standard src attribute
	 * - Lazy loading (data-src, data-lazy-src)
	 * - Alt, title attributes
	 * - Any container structure
	 *
	 * @param  string  $attrs  Image tag attributes.
	 * @param  string  $caption  Optional caption text.
	 *
	 * @return string Markdown image syntax.
	 * @since 1.2.1
	 */
	private function convert_image_to_markdown( string $attrs, string $caption = '' ): string {
		// Reads one quoted attribute value from the raw attribute string ('' when absent).
		// - The look-behind stops "src" from matching inside "data-src" / "data-lazy-src"
		//   (and "title" inside "data-title"), so the result no longer depends on the
		//   order in which the attributes were written.
		// - The back-reference requires the closing quote to equal the opening one, so a
		//   value such as alt="It's here" is not cut off at the apostrophe.
		$read_attribute = static function ( string $name ) use ( $attrs ): string {
			$pattern = '/(?<![\w-])' . preg_quote( $name, '/' ) . '\s*=\s*(["\'])(.*?)\1/is';

			return preg_match( $pattern, $attrs, $match ) ? $match[2] : '';
		};

		// When a lazy-loading attribute is present it holds the real URL and src is
		// only a placeholder, so those attributes are consulted before src itself.
		$src = '';
		foreach ( [ 'data-src', 'data-lazy-src', 'data-original', 'src' ] as $source_attribute ) {
			$src = $read_attribute( $source_attribute );
			if ( $src !== '' ) {
				break;
			}
		}

		// Alt text preference: alt attribute, then title attribute, then the supplied caption.
		$alt = $read_attribute( 'alt' );
		if ( $alt === '' ) {
			$alt = $read_attribute( 'title' );
		}
		if ( $alt === '' ) {
			$alt = $caption;
		}

		// Build markdown image syntax: ![alt](url)
		if ( $src !== '' ) {
			return $alt !== '' ? "![{$alt}]({$src})" : "![]({$src})";
		}

		// No URL at all: keep the alt text if there is one, otherwise a generic placeholder.
		return $alt !== '' ? "![{$alt}]" : '![Image]';
	}

	/* =============================================================
	 * HEADING PATH FORMATTING
	 * ============================================================= */

	/**
	 * Build heading path string from path array.
	 *
	 * @param  array  $path  Heading path array.
	 *
	 * @return string Heading path string.
	 * @since 1.3.0
	 */
	private function build_heading_path( array $path ): string {
		// Pull the 'text' of every path entry, in order, and join them as a breadcrumb.
		$labels = array_column( $path, 'text' );

		return implode( ' > ', $labels );
	}

	/* =============================================================
	 * HEADING PATH HELPERS
	 * ============================================================= */

	/**
	 * Update heading path with new heading.
	 *
	 * @param  array  $path  Current path.
	 * @param  string  $heading  New heading text.
	 * @param  int  $level  Heading level.
	 *
	 * @return array Updated path.
	 * @since 1.3.0
	 */
	private function update_heading_path( array $path, string $heading, int $level ): array {
		$ancestors = [];

		// Only strictly shallower headings remain ancestors of the new heading;
		// entries at the same or a deeper level are dropped.
		foreach ( $path as $entry ) {
			if ( $entry['level'] < $level ) {
				$ancestors[] = $entry;
			}
		}

		// The new heading becomes the deepest element of the re-indexed path.
		$ancestors[] = [ 'level' => $level, 'text' => $heading ];

		return $ancestors;
	}

	/**
	 * Build sections from plain text (fallback for non-HTML).
	 *
	 * Treats double newlines as paragraph boundaries and infers "sections" from them.
	 *
	 * @param  string  $text  Plain text content.
	 * @param  string  $title  Document title.
	 *
	 * @return array Array of sections with metadata.
	 * @since 1.3.0
	 */
	private function build_sections_from_plain_text( string $text, string $title ): array {
		$sections = [];

		// Normalise CRLF / CR line endings first: "\r\n\r\n" contains no "\n\n", so a
		// Windows-formatted document would otherwise be treated as one single paragraph.
		$text       = str_replace( [ "\r\n", "\r" ], "\n", $text );
		$paragraphs = explode( "\n\n", $text ); // Blank lines separate paragraphs

		// Text before the first inferred heading is filed under the document title.
		$current_heading = $title;
		$current_path    = [ [ 'level' => 1, 'text' => $title ] ];
		$current_content = '';

		foreach ( $paragraphs as $para ) {
			$para = trim( $para );
			// Compare against '' explicitly: empty() would also skip a paragraph "0".
			if ( $para === '' ) {
				continue;
			}

			// A first line made only of capitals/digits/colons, or any first line ending
			// in ":", is treated as a heading, provided more lines follow it in the paragraph.
			$lines      = explode( "\n", $para );
			$first_line = trim( $lines[0] );
			if ( preg_match( '/^[A-Z0-9\s:]+$|:\s*$/', $first_line ) && count( $lines ) > 1 ) {
				$this->flush_section( $sections, $current_heading, $current_path, $current_content );

				$level           = 2; // Inferred headings are always treated as sub-headings
				$current_path    = $this->update_heading_path( $current_path, $first_line, $level );
				$current_heading = $first_line;
				$current_content = implode( "\n", array_slice( $lines, 1 ) );
			} else {
				$current_content .= "\n\n" . $para;
			}
		}

		// Emit whatever followed the last inferred heading.
		$this->flush_section( $sections, $current_heading, $current_path, $current_content );

		return $sections;
	}

	/* =============================================================
	 * SMART CHUNK DISTRIBUTION WITH OVERLAP
	 * ============================================================= */

	/**
	 * Process sections with smart distribution and overlap.
	 *
	 * @param  array  $sections  Sections to process.
	 * @param  string  $title  Document title.
	 *
	 * @return array Processed chunks with metadata.
	 * @since 1.3.0
	 */
	private function process_sections_with_smart_distribution( array $sections, string $title ): array {
		$chunks = [];
		// Holds a section (or several merged ones) that may still absorb its successor.
		$buffer = null;

		foreach ( $sections as $section ) {
			$section_tokens = $section['tokens'] ?? $this->token_estimator->estimate( $section['content'] );

			// Oversized section: emit anything pending, then split the section itself.
			if ( $section_tokens > $this->max_tokens ) {
				if ( $buffer !== null ) {
					$chunks[] = $this->finalize_chunk( $buffer, $title, count( $chunks ) );
					$buffer   = null;
				}

				$split_chunks = $this->split_large_section( $section, $title, count( $chunks ) );
				$chunks       = array_merge( $chunks, $split_chunks );
				continue;
			}

			// Pending buffer: merge when the pair still fits, otherwise emit the buffer.
			if ( $buffer !== null ) {
				$buffer_tokens = $buffer['tokens'] ?? $this->token_estimator->estimate( $buffer['content'] );

				if ( $buffer_tokens + $section_tokens <= $this->max_tokens ) {
					$buffer = $this->merge_sections( $buffer, $section );
					continue;
				}

				$chunks[] = $this->finalize_chunk( $buffer, $title, count( $chunks ) );
				$buffer   = null;
			}

			// The buffer is always empty here, so a section below the minimum simply
			// becomes the new buffer and waits for a neighbour to merge with.
			if ( $section_tokens < $this->min_tokens ) {
				$buffer = $section;
				continue;
			}

			// Comfortably sized section: it is a chunk of its own.
			$chunks[] = $this->finalize_chunk( $section, $title, count( $chunks ) );
		}

		// Emit whatever is still buffered after the last section.
		if ( $buffer !== null ) {
			$chunks[] = $this->finalize_chunk( $buffer, $title, count( $chunks ) );
		}

		return $this->add_overlap_to_chunks( $chunks );
	}

	/**
	 * Merge two sections.
	 *
	 * @param  array  $section1  First section.
	 * @param  array  $section2  Second section.
	 *
	 * @return array Merged section.
	 * @since 1.3.0
	 */
	private function merge_sections( array $section1, array $section2 ): array {
		// The merged body is both trimmed bodies separated by a blank line,
		// and the token count is re-estimated on the joined text.
		$joined_content = trim( $section1['content'] ) . "\n\n" . trim( $section2['content'] );

		// Everything else starts out as a copy of the first section.
		$merged = array_merge(
			$section1,
			[
				'content' => $joined_content,
				'tokens'  => $this->token_estimator->estimate( $joined_content ),
			]
		);

		// Compare the level of each section's deepest heading (1 when unavailable).
		$first_path   = $section1['path_array'];
		$second_path  = $section2['path_array'];
		$first_level  = end( $first_path )['level'] ?? 1;
		$second_level = end( $second_path )['level'] ?? 1;

		// The section with the numerically smaller (more prominent) heading level
		// names the merged section; on a tie the first section keeps its heading.
		if ( $second_level < $first_level ) {
			$merged['heading']      = $section2['heading'];
			$merged['path_array']   = $section2['path_array'];
			$merged['heading_path'] = $section2['heading_path'];
		}

		return $merged;
	}

	/**
	 * Finalize a chunk with metadata.
	 *
	 * @param  array  $section  Section to finalize.
	 * @param  string  $title  Document title.
	 * @param  int  $chunk_index  Chunk index.
	 *
	 * @return array Finalized chunk.
	 * @since 1.3.0
	 */
	private function finalize_chunk( array $section, string $title, int $chunk_index ): array {
		// Resolve every optional field once. The heading level used to read
		// $section['path_array'] without a fallback, unlike the other fields, so a
		// section without a path failed inside end(); now all fields share one default.
		$path_array   = $section['path_array'] ?? [ [ 'level' => 1, 'text' => $title ] ];
		$heading      = $section['heading'] ?? $title;
		$content      = $section['content'] ?? '';
		$heading_path = $section['heading_path'] ?? $this->build_heading_path( $path_array );

		// The deepest path entry describes this chunk; the entries before it are its parents.
		$deepest_entry   = end( $path_array );
		$heading_level   = is_array( $deepest_entry ) ? ( $deepest_entry['level'] ?? 1 ) : 1;
		$parent_headings = array_slice( $path_array, 0, - 1 );

		return [
			'title'         => $heading,
			'heading'       => $heading,
			'heading_path'  => $heading_path ?: $title,
			'content'       => $content,
			'tokens'        => $section['tokens'] ?? $this->token_estimator->estimate( $content ),
			'chunk_index'   => $chunk_index,
			// Overlap fields start out empty.
			'overlap_start' => null,
			'overlap_end'   => null,
			'path_array'    => $path_array,
			'metadata'      => [
				'heading_level'   => $heading_level,
				'parent_headings' => array_column( $parent_headings, 'text' ),
				'semantic_type'   => $this->detect_semantic_type( $content ),
			],
		];
	}

	/**
	 * Detect semantic type of content.
	 *
	 * @param  string  $content  Content to analyze.
	 *
	 * @return string Semantic type.
	 * @since 1.3.0
	 */
	private function detect_semantic_type( string $content ): string {
		// Checked in order; the first pattern found on any line decides the type.
		$detectors = [
			[ 'list', '/^[\s]*[-*]\s/m' ], // bullet item ("- " or "* ")
			[ 'list', '/^\d+\.\s/m' ],     // numbered item ("1. ")
			[ 'quote', '/^>/m' ],          // blockquote line
			[ 'table', '/^\|.*\|$/m' ],    // table row ("| a | b |")
			[ 'code', '/^```/m' ],         // code fence
		];

		foreach ( $detectors as $detector ) {
			if ( preg_match( $detector[1], $content ) ) {
				return $detector[0];
			}
		}

		// Nothing structural was found.
		return 'paragraph';
	}

	/**
	 * Split a large section intelligently at semantic boundaries.
	 *
	 * @param  array  $section  Section to split.
	 * @param  string  $title  Document title.
	 * @param  int  $start_index  Starting chunk index.
	 *
	 * @return array Array of split chunks.
	 * @since 1.3.0
	 */
	private function split_large_section( array $section, string $title, int $start_index ): array {
		$content      = $section['content'];
		$total_tokens = $section['tokens'] ?? $this->token_estimator->estimate( $content );
		$chunks       = [];

		// Aim for evenly sized pieces rather than several full chunks plus a tiny tail.
		// Both divisors are floored at 1 so that neither division can divide by zero.
		$num_chunks         = max( 1, (int) ceil( $total_tokens / max( 1, $this->max_tokens ) ) );
		$ideal_chunk_tokens = (int) ( $total_tokens / $num_chunks );

		// Prefer structural boundaries; fall back to sentence ends.
		$boundaries = $this->find_semantic_boundaries( $content );

		if ( empty( $boundaries ) ) {
			$boundaries = $this->find_sentence_boundaries( $content );
		}

		// The boundary finders never report the end of the content, so the text after
		// the last boundary (the final paragraph/sentence, or the whole section when no
		// boundary exists) used to be dropped. Treat the end of the content as the last
		// boundary so the tail flows through the same logic as every other segment.
		$content_length = mb_strlen( $content );
		$boundaries     = array_values(
			array_filter(
				$boundaries,
				static fn( $position ) => $position > 0 && $position < $content_length
			)
		);
		$boundaries[]   = $content_length;

		$current_pos           = 0;
		$current_chunk_content = '';
		$current_chunk_tokens  = 0;
		$chunk_index           = $start_index;

		foreach ( $boundaries as $boundary_pos ) {
			$segment        = mb_substr( $content, $current_pos, $boundary_pos - $current_pos );
			$segment_tokens = $this->token_estimator->estimate( $segment );
			$current_pos    = $boundary_pos;

			if ( $current_chunk_tokens + $segment_tokens > $ideal_chunk_tokens && $current_chunk_tokens >= $this->min_tokens ) {
				// Adding this segment would overshoot the ideal size and the running chunk
				// is already viable: close it and start the next chunk with this segment.
				$chunks[] = $this->create_chunk_from_section(
					$section,
					trim( $current_chunk_content ),
					$title,
					$chunk_index ++
				);

				$current_chunk_content = $segment;
				$current_chunk_tokens  = $segment_tokens;
			} else {
				$current_chunk_content .= $segment;
				$current_chunk_tokens  += $segment_tokens;
			}

			// The running chunk outgrew the hard limit: cut it at the last boundary that
			// still fits and carry the rest over. When no such boundary exists the chunk
			// keeps growing and is dealt with by the safety passes below.
			if ( $current_chunk_tokens > $this->max_tokens ) {
				$last_boundary = $this->find_last_safe_split( $current_chunk_content, $this->max_tokens );
				if ( $last_boundary > 0 ) {
					$first_part  = mb_substr( $current_chunk_content, 0, $last_boundary );
					$second_part = mb_substr( $current_chunk_content, $last_boundary );

					$chunks[] = $this->create_chunk_from_section(
						$section,
						trim( $first_part ),
						$title,
						$chunk_index ++
					);

					$current_chunk_content = $second_part;
					$current_chunk_tokens  = $this->token_estimator->estimate( $second_part );
				}
			}
		}

		if ( trim( $current_chunk_content ) !== '' ) {
			$remaining_tokens = $this->token_estimator->estimate( $current_chunk_content );

			if ( $remaining_tokens > $this->max_tokens ) {
				// Safety check: the remainder is still too large, force split it.
				$force_split_chunks = $this->force_split_large_content(
					$current_chunk_content,
					$section,
					$title,
					$chunk_index
				);
				$chunks             = array_merge( $chunks, $force_split_chunks );
			} elseif ( $remaining_tokens < $this->min_tokens && ! empty( $chunks ) ) {
				// Small remainder: try to append it to the previous chunk.
				$last_key          = array_key_last( $chunks );
				$merged            = $chunks[ $last_key ];
				$merged['content'] .= "\n\n" . trim( $current_chunk_content );
				$merged['tokens']  = $this->token_estimator->estimate( $merged['content'] );

				if ( $merged['tokens'] > $this->max_tokens ) {
					// The merge would break the limit. Leave the previous chunk untouched
					// (it used to be popped and lost here) and emit the remainder on its own.
					$chunks[] = $this->create_chunk_from_section(
						$section,
						trim( $current_chunk_content ),
						$title,
						$chunk_index
					);
				} else {
					$chunks[ $last_key ] = $merged;
				}
			} else {
				$chunks[] = $this->create_chunk_from_section(
					$section,
					trim( $current_chunk_content ),
					$title,
					$chunk_index
				);
			}
		}

		// Final safety pass: Ensure no chunk exceeds max_tokens
		return $this->validate_and_fix_chunk_sizes( $chunks, $section, $title );
	}

	/**
	 * Find semantic boundaries in content (enhanced with code blocks, tables).
	 *
	 * @param  string  $content  Content to analyze.
	 *
	 * @return array Array of boundary positions.
	 * @since 1.3.0
	 */
	private function find_semantic_boundaries( string $content ): array {
		$boundaries = [];

		// Literal markers and how far past the marker's start the boundary lies:
		// after a blank line, before a bullet, after a code fence line, before a table row.
		$literal_markers = [
			"\n\n"    => 2,
			"\n- "    => 1,
			"\n```\n" => 4,
			"\n|"     => 1,
		];

		foreach ( $literal_markers as $marker => $advance ) {
			$pos = 0;
			while ( ( $pos = mb_strpos( $content, $marker, $pos ) ) !== false ) {
				$pos          += $advance;
				$boundaries[] = $pos;
			}
		}

		// Numbered list items: the boundary sits right after the newline before "N. ".
		// PREG_OFFSET_CAPTURE reports BYTE offsets, while every other boundary here (and
		// the mb_substr() calls that consume them) uses CHARACTER offsets, so each hit is
		// converted by counting the characters between consecutive hits. The look-ahead
		// leaves the item text unconsumed, so back-to-back items ("\n1.\n2. ") are all found.
		if ( preg_match_all( '/\n(?=\d+\.\s)/', $content, $matches, PREG_OFFSET_CAPTURE ) ) {
			$byte_cursor = 0;
			$char_cursor = 0;
			foreach ( $matches[0] as $match ) {
				$byte_end     = $match[1] + 1; // just past the "\n"
				$char_cursor  += mb_strlen( substr( $content, $byte_cursor, $byte_end - $byte_cursor ) );
				$byte_cursor  = $byte_end;
				$boundaries[] = $char_cursor;
			}
		}

		sort( $boundaries );

		// Several markers can point at the same position; return a gap-free list.
		return array_values( array_unique( $boundaries ) );
	}

	/**
	 * Find sentence boundaries as fallback.
	 *
	 * @param  string  $content  Content to analyze.
	 *
	 * @return array Array of boundary positions.
	 * @since 1.3.0
	 */
	private function find_sentence_boundaries( string $content ): array {
		// A sentence ends at ".", "!" or "?" followed by whitespace; the boundary is the
		// position right after that whitespace. A failed match (including invalid UTF-8,
		// which the /u modifier rejects) yields no boundaries.
		if ( ! preg_match_all( '/[.!?]\s+/u', $content, $matches, PREG_OFFSET_CAPTURE ) ) {
			return [];
		}

		// PREG_OFFSET_CAPTURE reports BYTE offsets, but callers slice with mb_substr(),
		// i.e. by CHARACTER. Adding mb_strlen() of the match to a byte offset (as before)
		// produced wrong positions for multibyte text and could restart the /u search in
		// the middle of a character. Walk the matches once and count characters instead.
		$boundaries  = [];
		$byte_cursor = 0;
		$char_cursor = 0;

		foreach ( $matches[0] as [ $matched_text, $byte_offset ] ) {
			$byte_end     = $byte_offset + strlen( $matched_text );
			$char_cursor  += mb_strlen( substr( $content, $byte_cursor, $byte_end - $byte_cursor ) );
			$byte_cursor  = $byte_end;
			$boundaries[] = $char_cursor;
		}

		return $boundaries;
	}

	/**
	 * Create a chunk from a section.
	 *
	 * @param  array  $section  Original section.
	 * @param  string  $content  Chunk content.
	 * @param  string  $title  Document title.
	 * @param  int  $chunk_index  Chunk index.
	 *
	 * @return array Chunk array.
	 * @since 1.3.0
	 */
	private function create_chunk_from_section(
		array $section,
		string $content,
		string $title,
		int $chunk_index
	): array {
		$tokens     = $this->token_estimator->estimate( $content );
		$heading    = $section['heading'] ?? $title;
		$path_array = $section['path_array'] ?? [ [ 'level' => 1, 'text' => $title ] ];

		// The deepest path entry describes this chunk; the entries before it are its parents.
		$deepest_entry   = end( $path_array );
		$heading_level   = is_array( $deepest_entry ) ? ( $deepest_entry['level'] ?? 1 ) : 1;
		$parent_headings = array_slice( $path_array, 0, - 1 );

		// Same shape as finalize_chunk(): chunks cut out of a large section carry the
		// overlap placeholders and the metadata block too, so consumers can treat every
		// chunk alike. The semantic type is detected on this piece's own text.
		return [
			'title'         => $heading,
			'heading'       => $heading,
			'heading_path'  => $section['heading_path'] ?? $title,
			'content'       => $content,
			'tokens'        => $tokens,
			'chunk_index'   => $chunk_index,
			'overlap_start' => null,
			'overlap_end'   => null,
			'path_array'    => $path_array,
			'metadata'      => [
				'heading_level'   => $heading_level,
				'parent_headings' => array_column( $parent_headings, 'text' ),
				'semantic_type'   => $this->detect_semantic_type( $content ),
			],
		];
	}

	/**
	 * Find last safe split point in content.
	 *
	 * @param  string  $content  Content to split.
	 * @param  int  $max_tokens  Maximum tokens for first part.
	 *
	 * @return int Position of last safe split, or 0 if not found.
	 * @since 1.3.0
	 */
	private function find_last_safe_split( string $content, int $max_tokens ): int {
		// Candidate split points: structural boundaries, or sentence ends when there are none.
		$candidates = $this->find_semantic_boundaries( $content );
		if ( empty( $candidates ) ) {
			$candidates = $this->find_sentence_boundaries( $content );
		}

		// Walk the candidates in order and remember the last one whose prefix still
		// fits the budget; stop at the first prefix that does not.
		$last_fitting = 0;
		foreach ( $candidates as $candidate ) {
			$prefix_tokens = $this->token_estimator->estimate( mb_substr( $content, 0, $candidate ) );
			if ( $prefix_tokens > $max_tokens ) {
				break;
			}
			$last_fitting = $candidate;
		}

		// 0 means that no candidate fits.
		return $last_fitting;
	}

	/**
	 * Force split large content when no good boundaries are found.
	 *
	 * Walks the content in windows of roughly max_tokens * 4 characters. Each
	 * window is cut at, in order of preference: the last sentence boundary
	 * inside the first 80% of the window, the last space in the final 100
	 * characters of the window, or the raw window end. Once the remaining text
	 * is estimated to fit in max_tokens it is emitted as the final chunk.
	 *
	 * If a chunk still exceeds max_tokens it is passed to
	 * truncate_chunk_to_max_tokens(); the next window then starts right after
	 * the text that was actually kept, so the cut-off tail is carried into the
	 * following chunk instead of being lost. Chunks whose trimmed content is
	 * empty are not emitted.
	 *
	 * @param  string  $content  Content to split.
	 * @param  array  $section  Original section data.
	 * @param  string  $title  Document title.
	 * @param  int  $start_index  Starting chunk index.
	 *
	 * @return array Array of chunks with consecutive chunk_index values starting at $start_index.
	 * @since 1.2.1
	 */
	private function force_split_large_content( string $content, array $section, string $title, int $start_index ): array {
		$chunks      = [];
		$content_len = mb_strlen( $content );
		$chunk_index = $start_index;
		$pos         = 0;

		// Character budget per chunk (rough estimate: 4 chars per token).
		// Clamped to at least 1 so every iteration is guaranteed to advance.
		$chars_per_token = 4;
		$max_chars       = max( 1, $this->max_tokens * $chars_per_token );

		// A sentence boundary is only accepted inside the first 80% of the window.
		$target_pos = (int) ( $max_chars * 0.8 );

		while ( $pos < $content_len ) {
			$remaining        = mb_substr( $content, $pos );
			$remaining_tokens = $this->token_estimator->estimate( $remaining );

			if ( $remaining_tokens <= $this->max_tokens ) {
				// Everything left fits in one chunk; a whitespace-only tail is dropped.
				$tail = trim( $remaining );
				if ( '' !== $tail ) {
					$chunks[] = $this->create_chunk_from_section( $section, $tail, $title, $chunk_index );
				}
				break;
			}

			// Default: take the whole window and advance past it.
			$chunk_text = mb_substr( $content, $pos, $max_chars );
			$advance    = $max_chars;

			// Preferred cut: last sentence boundary within the target range.
			$best_boundary = 0;
			foreach ( $this->find_sentence_boundaries( $chunk_text ) as $boundary ) {
				if ( $boundary > $target_pos ) {
					break;
				}
				$best_boundary = $boundary;
			}

			if ( $best_boundary > 0 ) {
				$chunk_text = mb_substr( $chunk_text, 0, $best_boundary );
				$advance    = $best_boundary;
			} else {
				// Fallback cut: last space near the end of the window. The offset is
				// clamped into the window because mb_strrpos() throws a ValueError on
				// PHP 8 when the offset lies outside the haystack (short final window),
				// and a negative offset (tiny budgets) would change its meaning.
				$search_from = min( max( 0, $max_chars - 100 ), mb_strlen( $chunk_text ) );
				$space_pos   = mb_strrpos( $chunk_text, ' ', $search_from );
				if ( false !== $space_pos && $space_pos > $max_chars * 0.5 ) {
					$chunk_text = mb_substr( $chunk_text, 0, $space_pos );
					$advance    = $space_pos + 1; // Skip the space itself.
				}
			}

			$chunk = $this->create_chunk_from_section( $section, trim( $chunk_text ), $title, $chunk_index );

			// Final safety check: if chunk still exceeds max_tokens, truncate it.
			if ( $chunk['tokens'] > $this->max_tokens ) {
				$full_len = mb_strlen( $chunk['content'] );
				$chunk    = $this->truncate_chunk_to_max_tokens( $chunk );
				$kept_len = mb_strlen( $chunk['content'] );

				if ( $kept_len > 0 && $kept_len < $full_len ) {
					// The truncated content is a prefix of the trimmed window text, so
					// resume directly after it; otherwise the removed tail would be
					// skipped and lost. This only ever shortens the advance.
					$leading_ws = mb_strlen( $chunk_text ) - mb_strlen( ltrim( $chunk_text ) );
					$advance    = $leading_ws + $kept_len;
				}
			}

			$pos += $advance;

			if ( '' !== $chunk['content'] ) {
				$chunks[] = $chunk;
				$chunk_index++;
			}
		}

		return $chunks;
	}

	/**
	 * Ensure no chunk exceeds max_tokens and renumber the result.
	 *
	 * Chunks within the limit are kept and only receive a fresh chunk_index.
	 * Oversized chunks are replaced by the output of
	 * force_split_large_content(), which rebuilds the pieces from $section and
	 * $title. The token count is taken from the chunk's `tokens` key when
	 * present, otherwise it is estimated from its content.
	 *
	 * @param  array  $chunks  Chunks to validate.
	 * @param  array  $section  Original section data, used when a chunk must be re-split.
	 * @param  string  $title  Document title, used when a chunk must be re-split.
	 *
	 * @return array Validated chunks with consecutive chunk_index values starting at 0.
	 * @since 1.2.1
	 */
	private function validate_and_fix_chunk_sizes( array $chunks, array $section, string $title ): array {
		$result     = [];
		$next_index = 0;

		foreach ( $chunks as $candidate ) {
			$token_count = $candidate['tokens'] ?? $this->token_estimator->estimate( $candidate['content'] ?? '' );

			if ( $token_count <= $this->max_tokens ) {
				// Already small enough: keep as-is, only renumber.
				$candidate['chunk_index'] = $next_index;
				$result[]                 = $candidate;
				$next_index ++;
				continue;
			}

			// Too large: replace with forcibly split pieces numbered from the current index.
			$pieces = $this->force_split_large_content( $candidate['content'], $section, $title, $next_index );
			foreach ( $pieces as $piece ) {
				$result[] = $piece;
			}
			$next_index += count( $pieces );
		}

		return $result;
	}

	/**
	 * Truncate a chunk to fit within max_tokens.
	 *
	 * The first pass cuts the content to 95% of a 4-characters-per-token
	 * budget, preferring the last `.`, `!` or `?` and then the last space, as
	 * long as that cut point lies beyond 70% of the budget. Because that
	 * character budget is only an approximation, the result is then checked
	 * against the token estimator; while it is still over max_tokens the
	 * budget is scaled down by the observed characters-per-token ratio and the
	 * cut is repeated.
	 *
	 * The returned content is always a (right-trimmed) prefix of the trimmed
	 * input content. A chunk that already fits is returned unchanged.
	 *
	 * @param  array  $chunk  Chunk to truncate; its `content` key is read.
	 *
	 * @return array Chunk with `content` and `tokens` updated when it was shortened.
	 * @since 1.2.1
	 */
	private function truncate_chunk_to_max_tokens( array $chunk ): array {
		$content         = $chunk['content'];
		$chars_per_token = 4; // Rough estimate
		$budget          = max( 0, (int) ( $this->max_tokens * $chars_per_token * 0.95 ) ); // 95% to be safe
		$shortened       = false;

		while ( true ) {
			if ( mb_strlen( $content ) > $budget ) {
				$truncated = mb_substr( $content, 0, $budget );

				// Try to truncate at a sentence boundary (false means "not found" -> 0).
				$sentence_end = max(
					mb_strrpos( $truncated, '.' ) ?: 0,
					mb_strrpos( $truncated, '!' ) ?: 0,
					mb_strrpos( $truncated, '?' ) ?: 0
				);

				if ( $sentence_end > $budget * 0.7 ) {
					$truncated = mb_substr( $truncated, 0, $sentence_end + 1 );
				} else {
					// Fallback: truncate at a space, otherwise keep the hard cut.
					$space_pos = mb_strrpos( $truncated, ' ' );
					if ( false !== $space_pos && $space_pos > $budget * 0.7 ) {
						$truncated = mb_substr( $truncated, 0, $space_pos );
					}
				}

				$content   = trim( $truncated );
				$shortened = true;
			}

			$tokens = $this->token_estimator->estimate( $content );
			if ( $tokens <= $this->max_tokens || '' === $content ) {
				break;
			}

			// The text is denser than 4 chars per token: derive a smaller budget from
			// the measured ratio. Capping at length - 1 guarantees the next pass
			// shortens the content, so the loop always terminates.
			$length = mb_strlen( $content );
			$scaled = (int) floor( $length * $this->max_tokens / max( 1, $tokens ) * 0.95 );
			$budget = max( 0, min( $scaled, $length - 1 ) );
		}

		if ( $shortened ) {
			$chunk['content'] = $content;
			$chunk['tokens']  = $tokens;
		}

		return $chunk;
	}

	/**
	 * Add overlap between chunks for context preservation.
	 *
	 * For every chunk, `overlap_start` is the tail of the previous chunk and
	 * `overlap_end` is the head of the next chunk, each limited to
	 * max_tokens * overlap_percentage tokens. Overlaps are always extracted
	 * from the chunks' original content, so baked text never leaks into a
	 * neighbour's overlap.
	 *
	 * When bake_overlap is enabled the overlaps are written into the chunk
	 * content (previous tail, blank line, content, blank line, next head) and
	 * `tokens` is re-estimated; otherwise they are stored under the
	 * `overlap_start` / `overlap_end` keys and the content is left untouched.
	 *
	 * @param  array  $chunks  Chunks to add overlap to (expected to be a 0-indexed list).
	 *
	 * @return array Chunks with overlap.
	 * @since 1.3.0
	 */
	private function add_overlap_to_chunks( array $chunks ): array {
		$total = count( $chunks );
		if ( $total <= 1 ) {
			return $chunks;
		}

		$overlap_tokens = (int) ( $this->max_tokens * $this->overlap_percentage );
		if ( $overlap_tokens <= 0 ) {
			// No overlap budget configured: nothing to add.
			return $chunks;
		}

		// Snapshot the untouched content so overlaps are never taken from text
		// that already had a neighbour's overlap baked into it.
		$originals = [];
		foreach ( $chunks as $chunk ) {
			$originals[] = $chunk['content'];
		}

		for ( $i = 0; $i < $total; $i ++ ) {
			$chunk = $chunks[ $i ];

			// Context before this chunk: end of the previous chunk.
			$overlap_start = $i > 0
				? $this->extract_overlap_text( $originals[ $i - 1 ], $overlap_tokens, 'end' )
				: '';

			// Context after this chunk: start of the next chunk.
			$overlap_end = $i < $total - 1
				? $this->extract_overlap_text( $originals[ $i + 1 ], $overlap_tokens, 'start' )
				: '';

			if ( $this->bake_overlap ) {
				if ( '' !== $overlap_start ) {
					$chunk['content'] = $overlap_start . "\n\n" . $chunk['content'];
				}
				if ( '' !== $overlap_end ) {
					$chunk['content'] .= "\n\n" . $overlap_end;
				}
				if ( '' !== $overlap_start || '' !== $overlap_end ) {
					$chunk['tokens'] = $this->token_estimator->estimate( $chunk['content'] );
				}
			} else {
				if ( '' !== $overlap_start ) {
					$chunk['overlap_start'] = $overlap_start;
				}
				if ( '' !== $overlap_end ) {
					$chunk['overlap_end'] = $overlap_end;
				}
			}

			$chunks[ $i ] = $chunk;
		}

		return $chunks;
	}

	/**
	 * Extract overlap text from content.
	 *
	 * The content is split into sentences (whitespace following `.`, `!` or
	 * `?`; the punctuation stays attached to its sentence). If that yields
	 * fewer than two parts, blank-line separated paragraphs are used instead,
	 * and if that also yields fewer than two parts a plain character slice of
	 * tokens_to_chars( $target_tokens ) characters is returned.
	 *
	 * Whole sentences/paragraphs are collected from the requested side until
	 * the next one would exceed the token budget, and are returned in their
	 * original order joined by single spaces. The result is empty when even
	 * the first unit does not fit.
	 *
	 * @param  string  $content  Content to extract from.
	 * @param  int  $target_tokens  Target token count for overlap.
	 * @param  string  $position  'end' to take from the end; any other value takes from the start.
	 *
	 * @return string Overlap text, or an empty string when nothing fits.
	 * @since 1.3.0
	 */
	private function extract_overlap_text( string $content, int $target_tokens, string $position ): string {
		if ( empty( $content ) || $target_tokens <= 0 ) {
			return '';
		}

		$from_end = ( 'end' === $position );

		// Prioritize sentences. The lookbehind splits on the whitespace only, so the
		// terminator is not emitted as a separate piece that would be counted and
		// re-joined as a free-standing "." token.
		$units = preg_split( '/(?<=[.!?])\s+/', $content, - 1, PREG_SPLIT_NO_EMPTY );
		if ( ! is_array( $units ) || count( $units ) < 2 ) {
			// Fallback to paragraphs
			$units = preg_split( '/\n\n+/', $content, - 1, PREG_SPLIT_NO_EMPTY );
			if ( ! is_array( $units ) || count( $units ) < 2 ) {
				// Char fallback
				$chars_needed = (int) $this->token_estimator->tokens_to_chars( $target_tokens );
				if ( $chars_needed <= 0 ) {
					// mb_substr( $content, -0 ) would return the entire content.
					return '';
				}

				return $from_end
					? mb_substr( $content, - $chars_needed )
					: mb_substr( $content, 0, $chars_needed );
			}
		}

		// Walk outward from the requested side; for 'end' that means last unit first.
		if ( $from_end ) {
			$units = array_reverse( $units );
		}

		$overlap_parts  = [];
		$current_tokens = 0;

		foreach ( $units as $unit ) {
			$unit = trim( $unit );
			if ( '' === $unit ) {
				continue;
			}

			$unit_tokens = $this->token_estimator->estimate( $unit );
			if ( $current_tokens + $unit_tokens > $target_tokens ) {
				break;
			}

			$overlap_parts[] = $unit;
			$current_tokens  += $unit_tokens;
		}

		// Restore reading order for overlaps collected from the end.
		if ( $from_end ) {
			$overlap_parts = array_reverse( $overlap_parts );
		}

		return implode( ' ', $overlap_parts );
	}
}
