<?php
/**
 * Content Analyzer for GFMR plugin
 *
 * Analyzes post content to determine the most appropriate Schema.org type
 * (Article, TechArticle, or HowTo).
 *
 * @package WpGfmRenderer
 * @since 1.4.0
 */

namespace Wakalab\WpGfmRenderer;

// Prevent direct access: stop immediately when ABSPATH is not defined,
// i.e. the file was not loaded through the WordPress bootstrap.
if ( ! defined( 'ABSPATH' ) ) {
	exit;
}

// Prevent class redeclaration when both Free and Pro versions are active:
// if the namespaced class already exists, stop loading the rest of this file.
if ( class_exists( __NAMESPACE__ . '\\GFMR_Content_Analyzer' ) ) {
	return;
}

/**
 * Content Analyzer class
 *
 * Classifies post content as Article, TechArticle or HowTo for schema
 * generation, and extracts HowTo step data from the content.
 */
class GFMR_Content_Analyzer {

	/**
	 * Technical keywords for TechArticle detection
	 *
	 * ASCII entries are matched case-insensitively as whole words (an optional
	 * plural "s"/"es" suffix is accepted); non-ASCII entries are matched as
	 * plain substrings because Japanese text has no word separators.
	 *
	 * @var array
	 */
	private $tech_keywords = array(
		'API',
		'関数',
		'クラス',
		'メソッド',
		'コード',
		'プログラミング',
		'function',
		'class',
		'method',
		'programming',
		'algorithm',
	);

	/**
	 * Detect the article type based on content analysis
	 *
	 * HowTo detection runs first and ignores code blocks so that step
	 * comments inside code do not trigger it. TechArticle detection receives
	 * the raw content because it needs to count the code blocks.
	 *
	 * @param string $content Post content.
	 * @return string Article type (Article, TechArticle, or HowTo).
	 */
	public function detect_article_type( $content ) {
		$content = $this->normalize_content( $content );

		// Check for HowTo patterns first (higher priority).
		if ( $this->is_how_to( $this->remove_code_blocks( $content ) ) ) {
			return 'HowTo';
		}

		// Check for TechArticle patterns (considers code block count).
		if ( $this->is_tech_article( $content ) ) {
			return 'TechArticle';
		}

		return 'Article';
	}

	/**
	 * Remove code blocks from content
	 *
	 * Removes Markdown fenced code blocks (```...```) and HTML <pre>...</pre>
	 * blocks (tag name matched case-insensitively). Inline <code> elements
	 * outside of <pre> are left untouched.
	 *
	 * @param string $content Content to process.
	 * @return string Content with code blocks removed.
	 */
	public function remove_code_blocks( $content ) {
		$content = $this->normalize_content( $content );

		$patterns = array(
			// Markdown fenced code blocks.
			'/```[\s\S]*?```/',
			// HTML <pre> blocks; \b keeps tags such as <preview> from matching.
			'/<pre\b[^>]*>.*?<\/pre>/is',
		);

		foreach ( $patterns as $pattern ) {
			$stripped = preg_replace( $pattern, '', $content );

			// preg_replace() returns null on a PCRE error (e.g. backtrack limit);
			// keep the current text rather than discarding the whole content.
			if ( null !== $stripped ) {
				$content = $stripped;
			}
		}

		return $content;
	}

	/**
	 * Check if content is a technical article
	 *
	 * Returns true if content has 3+ code blocks or if the text outside of
	 * code blocks contains one of the technical keywords.
	 *
	 * @param string $content Post content.
	 * @return bool True if content is technical.
	 */
	public function is_tech_article( $content ) {
		$content = $this->normalize_content( $content );

		// preg_match_all() may return false on error; treat that as zero.
		$fence_markers = (int) preg_match_all( '/```/', $content );
		$pre_blocks    = (int) preg_match_all( '/<pre\b[^>]*>/i', $content );

		// Fence markers come in open/close pairs; an unpaired marker is ignored.
		$code_block_count = (int) floor( $fence_markers / 2 ) + $pre_blocks;

		if ( $code_block_count >= 3 ) {
			return true;
		}

		// Check for technical keywords (excluding code blocks).
		return $this->contains_tech_keyword( $this->remove_code_blocks( $content ) );
	}

	/**
	 * Check if content is a HowTo article
	 *
	 * Returns true if content contains a step marker ("手順 1:", "ステップ1：",
	 * "Step 1:") or a Japanese "how to" phrase.
	 *
	 * @param string $content Post content (should have code blocks removed).
	 * @return bool True if content is a HowTo.
	 */
	public function is_how_to( $content ) {
		$content = $this->normalize_content( $content );

		// Every pattern uses the "u" modifier: the character class [:：] contains
		// a multi-byte full-width colon, which would otherwise be read as three
		// independent bytes and match unrelated characters.
		$howto_patterns = array(
			// Japanese step patterns.
			'/(?:手順|ステップ)\s*[0-9]*[:：]/u',
			// English step pattern; the lookbehind rejects words that merely
			// end in "step" (e.g. "misstep:").
			'/(?<![A-Za-z])Step\s*[0-9]*[:：]/iu',
			// Japanese "how to" patterns.
			'/する方法/u',
			'/のやり方/u',
		);

		foreach ( $howto_patterns as $pattern ) {
			if ( 1 === preg_match( $pattern, $content ) ) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Extract HowTo steps from content
	 *
	 * Collects steps from two sources, in this order:
	 *  1. "Step N: text" style markers (Japanese or English) anywhere in the content.
	 *  2. h2/h3 headings that start with a step marker followed by a number
	 *     and/or a colon.
	 *
	 * Step names are stripped of HTML. A heading that repeats a step already
	 * found by the first pass (same position and name) is not added again.
	 *
	 * @param string $content Post content.
	 * @return array Array of step data, each item being
	 *               array( 'position' => int, 'name' => string ).
	 */
	public function extract_how_to_steps( $content ) {
		$content = $this->normalize_content( $content );
		$steps   = array();
		$seen    = array();

		// The step text ends at the line break, at the end of the content, or at
		// the closing tag of the surrounding block element, so markup that
		// follows the step on the same line is not pulled into the name.
		// \h* (horizontal whitespace only) keeps an empty "Step 1:" line from
		// swallowing the following line.
		$line_pattern = '/(?:手順|ステップ|(?<![A-Za-z])Step)\s*([0-9]+)[:：]\h*(.+?)(?=\r?\n|<\/(?:h[1-6]|p|li|div)>|<br\b|$)/iu';

		if ( preg_match_all( $line_pattern, $content, $line_matches, PREG_SET_ORDER ) ) {
			foreach ( $line_matches as $line_match ) {
				$position = (int) $line_match[1];
				$name     = $this->strip_markup( $line_match[2] );
				$key      = $position . '|' . $name;

				if ( '' === $name || isset( $seen[ $key ] ) ) {
					continue;
				}

				$seen[ $key ] = true;
				$steps[]      = array(
					'position' => $position,
					'name'     => $name,
				);
			}
		}

		// Also check for h2/h3 headings that look like steps.
		if ( preg_match_all( '/<(h[23])\b[^>]*>(.+?)<\/\1>/i', $content, $heading_matches ) ) {
			$next_position = 1;

			foreach ( $heading_matches[2] as $heading_html ) {
				$heading = $this->strip_markup( $heading_html );

				// Require a number and/or a colon after the marker so that
				// ordinary headings such as "Steps to success" are not split.
				if ( 1 !== preg_match( '/^(?:手順|ステップ|Step)\s*(?:([0-9]+)\s*[:：]?|[:：])\s*(.+)$/iu', $heading, $parts ) ) {
					continue;
				}

				$name = trim( $parts[2] );
				if ( '' === $name ) {
					continue;
				}

				// Prefer the number written in the heading; otherwise continue
				// counting from the previous heading step.
				$position      = ( '' !== $parts[1] ) ? (int) $parts[1] : $next_position;
				$next_position = $position + 1;

				$key = $position . '|' . $name;
				if ( isset( $seen[ $key ] ) ) {
					continue;
				}

				$seen[ $key ] = true;
				$steps[]      = array(
					'position' => $position,
					'name'     => $name,
				);
			}
		}

		return $steps;
	}

	/**
	 * Check whether text contains one of the technical keywords
	 *
	 * @param string $text Text to search (code blocks already removed).
	 * @return bool True if at least one keyword is present.
	 */
	private function contains_tech_keyword( $text ) {
		$ascii_terms = array();

		foreach ( $this->tech_keywords as $keyword ) {
			if ( 1 === preg_match( '/^[\x20-\x7E]+$/', $keyword ) ) {
				$ascii_terms[] = preg_quote( $keyword, '/' );
				continue;
			}

			// Non-ASCII (Japanese) keyword: plain substring search.
			if ( false !== strpos( $text, $keyword ) ) {
				return true;
			}
		}

		if ( empty( $ascii_terms ) ) {
			return false;
		}

		// Letter lookarounds prevent hits inside longer words ("rapid",
		// "classical") while still accepting plurals ("APIs", "classes") and
		// keywords directly adjacent to Japanese text or punctuation.
		$pattern = '/(?<![A-Za-z])(?:' . implode( '|', $ascii_terms ) . ')(?:e?s)?(?![A-Za-z])/i';

		return 1 === preg_match( $pattern, $text );
	}

	/**
	 * Remove HTML tags from a fragment and trim surrounding whitespace
	 *
	 * Uses wp_strip_all_tags() when it is available and falls back to the
	 * native strip_tags() otherwise.
	 *
	 * @param string $text Raw fragment.
	 * @return string Plain text.
	 */
	private function strip_markup( $text ) {
		$plain = function_exists( 'wp_strip_all_tags' ) ? wp_strip_all_tags( $text ) : strip_tags( $text );

		return trim( $plain );
	}

	/**
	 * Coerce the content argument to a string
	 *
	 * @param mixed $content Value passed by the caller.
	 * @return string The value as a string; an empty string for null and
	 *                other non-scalar values.
	 */
	private function normalize_content( $content ) {
		if ( is_string( $content ) ) {
			return $content;
		}

		return is_scalar( $content ) ? (string) $content : '';
	}
}
