<?php

namespace CocktailRecipes\Core\Helpers;

use Normalizer;

/**
 * Stateless helpers for cleaning up and normalizing text.
 *
 * All methods are static. A backslash in the input acts as an escape marker: decodeEsc()
 * turns it into an ASCII ESC placeholder, and the dash/space normalizers leave any
 * character directly preceded by that placeholder untouched.
 *
 * Regex-based steps never fail hard: if PCRE reports an error (for instance because the
 * input is not valid UTF-8) the affected step is skipped and the text passes through as-is.
 */
final class Sanitizer
{
    // Escape character, i.e. ASCII ESC
    public const ESC = "\x1B";

    /**
     * Cleanup content provided by WordPress
     *
     * Conversions done:
     * 1. HTML entities decoded to corresponding chars
     * 2. <pre>, <p> and <br> tags (with or without attributes) converted to LF (optional)
     * 3. Other HTML tags removed (optional)
     * 4. Text normalized (i.e. backslashes, dash/hyphen, period, spaces, etc.) by normalizeText()
     * 5. Consecutive LF chars collapsed (optional)
     * 6. Whitespace trimmed from start/end
     *
     * Options:
     *   'fix_breaks'     => bool           convert <pre>/<p>/<br> tags to LF (default true)
     *   'strip_tags'     => bool           remove HTML tags (default false)
     *   'strip_controls' => bool           remove unexpected control chars (default true)
     *   'escapes'        => bool           backslashes to ASCII ESC chars or literals (default true)
     *   'dash'           => string | null  dash/hyphen conversion (default '-'); null for as-is
     *   'dots'           => bool           period/dot conversion (default true)
     *   'tab'            => string | null  tab conversion (default ' '); null for as-is
     *   'nbsp'           => string | null  no-break spaces (default ' '); null for as-is
     *   'collapse_space' => bool           collapse consecutive ASCII spaces (default false)
     *   'collapse_lf'    => bool           collapse consecutive LF (default true)
     *
     * @param string $content Raw content
     * @param array  $options See the option list above; also forwarded to normalizeText()
     * @return string Cleaned content
     *
     * @see Sanitizer::normalizeText()
     */
    public static function cleanContent(string $content, array $options = []): string
    {
        // Ensure valid UTF-8 (second argument asks WordPress to strip invalid sequences)
        $content = wp_check_invalid_utf8($content, true);

        // Decode HTML entities like &#8211; or &ndash; into their actual characters
        // NOTE(review): decoding happens before the tag handling below, so encoded markup such as
        //               `&lt;br&gt;` is subsequently treated as a real tag - confirm this is intended.
        $content = html_entity_decode($content, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Convert WordPress inserted and other <pre>, <p> and/or <br> tags in content
        if ($options['fix_breaks'] ?? true) {
            // paragraph boundaries (closing tag directly followed by an opening tag) become a blank line
            $content = self::replacePattern(
                '/[ \t]*<\/p>\s*<p(?:\s[^>]*)?>[ \t]*\r?\n?/i',
                "\n\n",
                $content
            );
            // line breaks and other block-related breaks; attributes are tolerated on every tag
            $content = self::replacePattern([
                '/[ \t]*<\/?pre\b[^>]*>[ \t]*\r?\n?/i',        // <pre ...> or </pre>
                '/[ \t]*<br(?:\s[^>]*)?\/?>[ \t]*\r?\n?/i',    // <br>, <br/> or <br ...>
                '/[ \t]*<\/?p(?:\s[^>]*)?>[ \t]*\r?\n?/i',     // stray <p ...> or </p>
            ], "\n", $content);
        }

        // Strip HTML tags
        if ($options['strip_tags'] ?? false) {
            $content = wp_strip_all_tags($content, false);
        }

        // Normalize Unicode dashes, periods, whitespaces and decomposed accents
        $content = self::normalizeText($content, $options);

        // Collapse multiple LF chars
        if ($options['collapse_lf'] ?? true) {
            $content = self::replacePattern("/\n{2,}/", "\n", $content);
        }

        return trim($content);
    }

    /**
     * Normalize text (backslashes, dashes, periods, whitespaces and decomposed accents)
     *
     * Conversions done:
     * 1. Unexpected control chars removed (optional)
     * 2. Backslashes normalized by decodeEsc()
     * 3. Dashes/hyphens normalized by normalizeDashes()
     * 4. Periods/dots normalized by normalizePeriods()
     * 5. Whitespaces normalized by normalizeSpaces()
     * 6. Decomposed Unicode accents normalized (NFD --> NFC) when the Normalizer class is available
     *
     * Options:
     *   'strip_controls' => bool           remove unexpected control chars (default true)
     *   'escapes'        => bool           backslashes to ASCII ESC chars or literals (default true)
     *   'dash'           => string | null  dash/hyphen conversion (default '-'); null for as-is
     *   'dots'           => bool           period/dot conversion (default true)
     *   'tab'            => string | null  tab conversion (default ' '); null for as-is
     *   'nbsp'           => string | null  no-break spaces (default ' '); null for as-is
     *   'collapse_space' => bool           collapse consecutive ASCII spaces (default false)
     *
     * @param string $text    Text to normalize
     * @param array  $options See the option list above; also forwarded to normalizeSpaces()
     * @return string Normalized text
     *
     * @see Sanitizer::decodeEsc()
     * @see Sanitizer::normalizeDashes()
     * @see Sanitizer::normalizePeriods()
     * @see Sanitizer::normalizeSpaces()
     */
    public static function normalizeText(string $text, array $options = []): string
    {
        // Remove unexpected control characters; TAB, LF, VT, FF and CR (0x09-0x0D) are kept
        // here because normalizeSpaces() deals with them
        if ($options['strip_controls'] ?? true) {
            $text = self::replacePattern('/[\x00-\x08\x0E-\x1F\x7F]/', '', $text);
        }

        // Normalize backslashes, dashes, periods and spaces
        if ($options['escapes'] ?? true) {
            $text = self::decodeEsc($text);
        }

        // An explicit null must disable the conversion, so `??` cannot be used for this option
        $dash = self::option($options, 'dash', '-');
        if ($dash !== null) {
            $text = self::normalizeDashes($text, $dash);
        }

        if ($options['dots'] ?? true) {
            $text = self::normalizePeriods($text);
        }
        $text = self::normalizeSpaces($text, $options);

        // Normalize Unicode accents (NFD to NFC)
        if (class_exists('Normalizer')) {
            // normalize() returns false on failure; keep the text unchanged in that case
            $composed = Normalizer::normalize($text, Normalizer::FORM_C);
            if (is_string($composed)) {
                $text = $composed;
            }
        }

        return $text;
    }

    /**
     * Converts backslashes into ASCII ESC characters or literal backslashes
     *
     * Any ESC character already present is removed first. A doubled backslash then becomes one
     * literal backslash, and every remaining single backslash becomes an ESC placeholder.
     */
    public static function decodeEsc(string $text): string
    {
        $withoutEsc = str_replace(self::ESC, '', $text);

        // strtr() tries the longest key first, so `\\` wins over `\`
        return strtr($withoutEsc, ['\\\\' => '\\', '\\' => self::ESC]);
    }

    /** Remove ASCII ESC placeholders previously inserted by decodeEsc() */
    public static function stripEsc(string $text): string
    {
        return str_replace(self::ESC, '', $text);
    }

    /**
     * Normalize dashes (Unicode dash and hyphen variants)
     *
     * Converts all dash-like punctuation to a standard ASCII hyphen-minus or
     * caller-provided replacement. The replacement is inserted literally, i.e. `$0` or `\1`
     * inside it are not treated as regex back-references.
     *
     * If the text cannot be processed by PCRE (e.g. it is not valid UTF-8) it is returned unchanged.
     *
     * @param string $text    Text to normalize
     * @param string $replace Literal replacement for each dash variant
     * @return string Text with dash variants replaced
     */
    public static function normalizeDashes(string $text, string $replace = '-'): string
    {
        // Normalize Unicode dash variants to ASCII hyphen/minus (unless preceded by ESC char)
        // @todo future - if we ever have a use case for it, we could add option to ignore ESC chars
        //   U+2010 = Hyphen
        //   U+2011 = Non-Breaking Hyphen
        //   U+2012 = Figure Dash
        //   U+2013 = En Dash
        //   U+2014 = Em Dash
        //   U+2015 = Horizontal Bar
        //   U+2043 = Hyphen Bullet
        //   U+2212 = Minus Sign
        //   U+FE58 = Small Em Dash
        //   U+FE63 = Small Hyphen-Minus
        //   U+FF0D = Fullwidth Hyphen-Minus
        $result = preg_replace_callback(
            '/(?<!\x1B)[\x{2010}-\x{2015}\x{2043}\x{2212}\x{FE58}\x{FE63}\x{FF0D}]/u',
            // a callback keeps the replacement literal (no `$n` / `\n` interpretation)
            static function () use ($replace): string {
                return $replace;
            },
            $text
        );

        // null signals a PCRE error; fall back to the untouched input
        return $result ?? $text;
    }

    /**
     * Normalize periods/dots
     *
     * Convert period/dot-like characters when used as periods within or at the end
     * of words into ASCII period.
     *
     * If the text cannot be processed by PCRE (e.g. it is not valid UTF-8) it is returned unchanged.
     */
    public static function normalizePeriods(string $text): string
    {
        // Period-like characters
        //   U+2024 = One Dot Leader        visually identical to periods; can be mid-letter or end-of-word
        //
        // Middle dot-like characters
        //   U+00B7 = Middle Dot            sometimes used as stylistic separator; e.g. m·l
        //   U+2027 = Hyphenation Point     sometimes used between syllables or names; e.g. m‧l
        //   U+2219 = Bullet Operator       math or pseudo-bullet character; m∙l
        //
        // Dots with semantic purposes and/or non-punctuation (deliberately left alone)
        //   U+0387 = Greek Ano Teleia                      Greek semicolon; semantic punctuation
        //   U+1427 = Canadian Syllabics Final Middle Dot   Linguistic orthography marker; not punctuation
        //   U+2022 = Bullet                                List bullet; not punctuation
        //   U+22C5 = Dot Operator                          Math operator
        //   U+30FB = Katakana Middle Dot                   Japanese name separator; semantically distinct

        // Normalize mid-letter dots; e.g. "m·l", "St·Germain"
        // Note: We do not need negative lookbehinds for ESC in these two regex patterns because we only
        //       match if preceded by a letter. Thus we can easily escape any of these with a backslash.
        $text = self::replacePattern(
            '/(?<=\p{L})[\x{00B7}\x{2024}\x{2027}\x{2219}](?=\p{L})/u',
            '.',
            $text
        );

        // End-of-word abbreviation dots
        return self::replacePattern(
            '/(?<=\p{L})[\x{2024}](?=\b|\s|[.,:;!?\-()\[\]]|$)/u',
            '.',
            $text
        );
    }

    /**
     * Normalize whitespace
     *
     * Conversions done:
     * 1. Unicode space-like characters converted to ASCII space
     * 2. CR/LF, CR and Unicode line/paragraph separators converted to LF
     * 3. FF, VT converted to ASCII space
     * 4. Zero-width space, soft hyphen, Mongolian vowel separator and NUL removed
     * 5. Tabs converted to ASCII space (optional/configurable)
     * 6. No-break spaces converted to ASCII space (optional/configurable)
     * 7. Consecutive ASCII spaces collapsed (optional)
     *
     * Options:
     *   'tab'            => string | null  tab conversion (default ' '); null for as-is
     *   'nbsp'           => string | null  no-break spaces (default ' '); null for as-is
     *   'collapse_space' => bool           collapse consecutive ASCII spaces (default false)
     *
     * If PCRE cannot process the text (e.g. it is not valid UTF-8), step 1 is skipped; the
     * remaining byte-level steps still run.
     *
     * @param string $text    Text to normalize
     * @param array  $options See the option list above
     * @return string Text with whitespace normalized
     */
    public static function normalizeSpaces(string $text, array $options = []): string
    {
        // Normalize uncommon Unicode spaces (unless preceded by ESC char)
        // @todo future - if we ever have a use case for it, we could add option to ignore ESC chars
        //   U+1680 = Ogham Space Mark
        //   U+2000 = En Quad
        //   U+2001 = Em Quad
        //   U+2002 = En Space
        //   U+2003 = Em Space
        //   U+2004 = Three-Per-Em Space
        //   U+2005 = Four-Per-Em Space
        //   U+2006 = Six-Per-Em Space
        //   U+2007 = Figure Space
        //   U+2008 = Punctuation Space
        //   U+2009 = Thin Space
        //   U+200A = Hair Space
        //   U+205F = Medium Mathematical Space (MMSP)
        //   U+3000 = Ideographic Space
        $text = self::replacePattern(
            '/(?<!\x1B)[\x{1680}\x{2000}-\x{200A}\x{205F}\x{3000}]/u',
            ' ',
            $text
        );

        // Convert CR+LF, CR and Unicode line/paragraph separators into LF
        $text = str_replace([
            "\r\n",             // CR+LF
            "\r",               // CR
            "\xE2\x80\xA8",     // U+2028 = Line Separator
            "\xE2\x80\xA9",     // U+2029 = Paragraph Separator
            "\xC2\x85",         // U+0085 = <Next Line> (NEL)
        ], "\n", $text);

        // Convert FF and VT into spaces
        $text = str_replace(["\f", "\v"], ' ', $text);

        // Remove zero-width and invisible characters, plus ASCII NUL
        $text = str_replace([
            "\xE2\x80\x8B",     // U+200B = Zero Width Space (ZWSP)
            "\xC2\xAD",         // U+00AD = Soft Hyphen (SHY)
            "\xE1\xA0\x8E",     // U+180E = Mongolian Vowel Separator (MVS)
            "\x00",             // U+0000 = <Null> (NUL)
        ], '', $text);

        // Convert tabs (optional); an explicit null leaves tabs as-is
        $tab = self::option($options, 'tab', ' ');
        if ($tab !== null) {
            $text = str_replace("\t", $tab, $text);
        }

        // Convert non-breaking spaces (optional); an explicit null leaves them as-is
        $nbsp = self::option($options, 'nbsp', ' ');
        if ($nbsp !== null) {
            $text = str_replace([
                "\xC2\xA0",     // U+00A0 = No-Break Space (NBSP)
                "\xE2\x80\xAF", // U+202F = Narrow No-Break Space (NNBSP)
            ], $nbsp, $text);
        }

        // Collapse multiple spaces (optional)
        if ($options['collapse_space'] ?? false) {
            $text = self::replacePattern('/ {2,}/', ' ', $text);
        }

        return $text;
    }

    /**
     * Strip any `// comments` from text
     *
     * Cuts the text at the first qualifying '//'. To avoid stripping URLs, the marker must be
     * at the very start of the text or preceded by a whitespace, and must not be immediately
     * followed by a domain. You can also precede the comment with a backslash to escape it,
     * e.g. `\//not a comment`.
     *
     * Everything after the marker is dropped, including any later lines, so pass a single
     * line at a time if subsequent lines must survive.
     *
     * @param string $text Text that may contain a `//` comment
     * @return string Text up to (not including) the comment marker, or the text unchanged
     */
    public static function stripComments(string $text): string
    {
        // @todo future - if we ever have a use case for it, we could add option to ignore ESC chars
        $pos = 0;
        while (($pos = strpos($text, '//', $pos)) !== false) {
            // require `//` to be at start of text or have whitespace before it
            // (this rules out '://' as well as an escaping backslash / ESC placeholder)
            $atBoundary = $pos === 0 || ctype_space($text[$pos - 1]);

            // also require a domain name not to immediately follow it, e.g. '//domain.com';
            // the `A` modifier anchors the match at the given offset
            if ($atBoundary && !preg_match('#//[\p{L}0-9-]+\.[\p{L}0-9-]+#Au', $text, $m, 0, $pos)) {
                return substr($text, 0, $pos);
            }

            // an overlapping '//' one byte further would be preceded by '/', so skipping it is safe
            $pos += 2;
        }

        return $text;
    }

    /**
     * Read an option, honouring an explicit null.
     *
     * Unlike `$options[$key] ?? $default`, a key that is present with a null value yields
     * null instead of the default.
     *
     * @param array  $options Option map
     * @param string $key     Option name
     * @param mixed  $default Value used when the key is absent
     * @return mixed
     */
    private static function option(array $options, string $key, $default)
    {
        return array_key_exists($key, $options) ? $options[$key] : $default;
    }

    /**
     * preg_replace() that falls back to the unmodified subject when PCRE reports an error.
     *
     * @param string|string[] $pattern     One pattern or a list of patterns
     * @param string          $replacement Replacement string (regex replacement syntax applies)
     * @param string          $subject     Text to process
     * @return string
     */
    private static function replacePattern($pattern, string $replacement, string $subject): string
    {
        return preg_replace($pattern, $replacement, $subject) ?? $subject;
    }
}
