<?php

namespace CocktailRecipes\Core\Helpers;

/**
 * Static, UTF-8 aware text helpers (character extraction, case matching,
 * quote removal and token normalization).
 */
final class Text
{
    /**
     * Map of an opening quote character to the set of characters accepted as
     * its matching closing quote (each value is a string of 1..n characters).
     */
    private const QUOTE_PAIRS = [
        // U+2018/2019  left/right single quotation marks
        // U+201C/201D  left/right double quotation marks
        // U+201A       single low-9 quotation mark
        // U+201B       single high-reversed-9 quotation mark
        // U+201E       double low-9 quotation mark
        // U+201F       double high-reversed-9 quotation mark
        // U+00AB/00BB  left/right-pointing double angle quotation marks
        // U+2039/203A  single left/right-pointing angle quotation marks
        // U+2E42       double low-reversed-9 quotation mark (legacy)

        // Single/double ASCII quotes
        "'" => "'\u{2019}\u{201B}",
        '"' => "\"\u{201D}\u{201F}",

        // English smart quotes
        "\u{2018}" => "\u{2019}\u{201B}",           // left single + right/reversed
        "\u{2019}" => "\u{2019}",                   // right single
        "\u{201C}" => "\u{201D}\u{201F}",           // left double + right/double-high
        "\u{201D}" => "\u{201D}",                   // right double quotes

        // Angle quotes
        "\u{00AB}" => "\u{00BB}",                   // double angle quotes
        "\u{2039}" => "\u{203A}",                   // single angle quotes

        // Low-9 opening quotes (common in de/pl)
        "\u{201A}"   => "\u{2018}\u{2019}",         // low single + single
        "\u{201E}"   => "\u{201C}\u{201D}\u{2E42}", // low double + double
    ];

    /**
     * True if text is a valid identifier
     *
     * An identifier starts with an ASCII letter, followed by ASCII letters or
     * digits, optionally followed by groups of a single underscore and one or
     * more letters/digits (so no leading, trailing or doubled underscores).
     * Matching is case-insensitive.
     */
    public static function isIdentifier(string $text): bool
    {
        // '\z' (not '$') so that a trailing newline is rejected; '$' would also
        // match just before a final "\n"
        return preg_match('/^[a-z][a-z0-9]*(?:_[a-z0-9]+)*\z/i', $text) === 1;
    }

    /**
     * Get first character(s) from text
     *
     * @param string $text  UTF-8 text
     * @param int    $len   Number of characters to return (default 1)
     */
    public static function first(string $text, int $len = 1): string
    {
        return ($text !== '') ? mb_substr($text, 0, $len, 'UTF-8') : '';
    }

    /**
     * Get last character(s) from text
     *
     * @param string $text  UTF-8 text
     * @param int    $len   Number of characters to return (default 1)
     */
    public static function last(string $text, int $len = 1): string
    {
        // Explicit length keeps $len = 0 returning '' (offset -0 is the start of text)
        return ($text !== '') ? mb_substr($text, -$len, $len, 'UTF-8') : '';
    }

    /** UTF-8 safe ucfirst() */
    public static function ucFirst(string $text): string
    {
        return mb_strtoupper(mb_substr($text, 0, 1, 'UTF-8'), 'UTF-8')
            . mb_substr($text, 1, null, 'UTF-8');
    }

    /**
     * Adjust target to match source case if it's upper or ucfirst
     *
     * - Source is empty or starts with a lowercase character: target returned as is
     * - Source is entirely uppercase: target is uppercased
     * - Otherwise (first character is not lowercase): target is ucfirst'ed
     *
     * Note: A first character without case (digit, punctuation, ...) equals its
     * own uppercase form and is therefore handled like an uppercase character.
     */
    public static function matchCase(string $src, string $target): string
    {
        // An empty source carries no case information
        if ($src === '') return $target;

        $first = mb_substr($src, 0, 1, 'UTF-8');
        if ($first !== mb_strtoupper($first, 'UTF-8')) return $target;

        // Strict comparison: with '==' numeric strings such as "1e3" and "1E3"
        // would be compared as numbers and wrongly considered identical
        return $src === mb_strtoupper($src, 'UTF-8')
            ? mb_strtoupper($target, 'UTF-8')
            : self::ucFirst($target);
    }

    /**
     * Remove matching quotes around text
     *
     * Quotes are only removed when the text has at least two characters, starts
     * with a known opening quote, ends with one of its accepted closing quotes
     * (see QUOTE_PAIRS) and the closing quote is not preceded by Sanitizer::ESC.
     * In all other cases the text is returned unchanged.
     */
    public static function unquote(string $text): string
    {
        // @todo future - if we ever have a use case for it, we could add option to ignore ESC char
        if (mb_strlen($text, 'UTF-8') < 2) return $text;

        // Characters accepted as closing quote for the opening character (if it is a quote)
        $closing = self::QUOTE_PAIRS[self::first($text)] ?? null;
        if ($closing === null) return $text;

        // Last character must be one of the accepted closing quotes
        if (mb_strpos($closing, self::last($text), 0, 'UTF-8') === false) return $text;

        // Character before the closing quote must not be the ESC character
        if (mb_substr($text, -2, 1, 'UTF-8') === Sanitizer::ESC) return $text;

        return mb_substr($text, 1, -1, 'UTF-8');
    }

    /**
     * Convert text into a normalized token
     *
     * Conversions done:
     * 1. Text (backslashes, dash/hyphen, period, spaces, etc.) normalized (optional)
     * 2. Leading/trailing whitespaces trimmed
     * 3. Lowercased (optionally bypassed)
     * 4. Quotes around text removed
     * 5. Periods following a letter (abbreviations) replaced by a space
     * 6. Whitespaces converted to spaces and collapsed
     * 7. Word connectors ('&', '+', '-', '/') converted to '_'
     * 8. Apostrophes in words removed
     * 9. Spaces converted to underscores ('_')
     *
     * The result is finally passed through Sanitizer::stripEsc().
     *
     * Options:
     *   'normalize' => bool    Unicode text normalization (default false)
     *   'lower'     => bool    Convert to lowercase (default true)
     *
     * Note: If a regular expression replacement fails (preg_replace() returns
     * null, e.g. for malformed UTF-8 with the 'u' modifier), the text is treated
     * as empty from that step on.
     *
     * @param string               $text     Text to convert
     * @param array<string, mixed> $options  See above; also passed on to Sanitizer::normalizeText()
     *
     * @see Sanitizer::normalizeText() for text normalization options
     */
    public static function toToken(string $text, array $options = []): string
    {
        // Normalize dashes/hyphens, periods/dots, whitespaces, etc. (optional)
        // Note: Not necessary if text was first passed through Sanitizer::cleanContent()
        if ($options['normalize'] ?? false) {
            $text = Sanitizer::normalizeText($text, $options);
        }

        // Trim and lowercase
        if (($text = trim($text)) === '') return '';
        if ($options['lower'] ?? true) {
            $text = mb_strtolower($text, 'UTF-8');
        }

        // Remove quotes
        if (($text = self::unquote($text)) === '') return '';

        // Convert periods of abbreviations to spaces; e.g. "St. George Gin" --> "St George Gin"
        // (the extra spaces are collapsed in the next step)
        $text = rtrim(self::replace('/(?<=\p{L})\.(?=(\p{L}|\s|$))/u', ' ', $text));

        // Convert and collapse whitespaces within text
        // Note: 'u' modifier omitted on assumption spaces were already normalized
        $text = self::replace('/\s+/', ' ', $text);

        // Convert '&', '+', '-' or '/' between words to '_'
        $text = self::replace('/(?<=[\p{L}])\s?[-&+\/]\s?(?=[\p{L}])/u', '_', $text);

        // Remove apostrophes in words; e.g. "jack daniel's" --> "jack daniels"
        //  U+2019  right single quotation mark (most common "smart" apostrophe)
        //  U+2018  left single quotation mark
        //  U+02BC  modifier letter apostrophe (used in some fonts/languages)
        //  U+201B  reversed single quotation mark (rare)
        $text = self::replace("/(?<=\p{L})['\x{2019}\x{2018}\x{02BC}\x{201B}](?=\p{L})/u", '', $text);

        // Convert spaces to underscores
        $text = str_replace(' ', '_', $text);

        return Sanitizer::stripEsc($text);
    }

    /**
     * preg_replace() wrapper that always returns a string
     *
     * preg_replace() returns null on failure; passing that null on to string
     * functions is deprecated as of PHP 8.1, so it is mapped to '' here.
     */
    private static function replace(string $pattern, string $replacement, string $text): string
    {
        return preg_replace($pattern, $replacement, $text) ?? '';
    }
}
