'use strict';

// Public entry point. `tokenize` is a function declaration further down in
// this file, so it is hoisted and can be exported before its definition.
module.exports.tokenize = tokenize;

// Optional single trailing code unit in \uFE00-\uFEFF. This range contains the
// variation selectors (e.g. \uFE0F) that frequently follow an emoji.
const EMOJI_TRAILER = String.raw`[\uFE00-\uFEFF]?`;

// Symbol classes that may each be followed by EMOJI_TRAILER.
const EMOJI_SYMBOL_CLASSES = [
    // Individually listed symbols such as (c), (r), double exclamation, TM.
    String.raw`[\xA9\xAE\u203C\u2047-\u2049\u2122\u2139\u3030\u303D\u3297\u3299]`,
    // Arrows
    String.raw`[\u2190-\u21FF]`,
    // Miscellaneous Technical
    String.raw`[\u2300-\u23FF]`,
    // Enclosed Alphanumerics
    String.raw`[\u2460-\u24FF]`,
    // Geometric Shapes
    String.raw`[\u25A0-\u25FF]`,
    // Miscellaneous Symbols and Dingbats
    String.raw`[\u2600-\u27BF]`,
    // Supplemental Arrows-B
    String.raw`[\u2900-\u297F]`,
    // Miscellaneous Symbols and Arrows (up to \u2BF0)
    String.raw`[\u2B00-\u2BF0]`,
    // Surrogate pairs encoding U+1F000-U+1F3FF and U+1F400-U+1F6FF
    String.raw`(?:\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDEFF])`
];

// Full pattern: keycap sequences ('#' or a digit followed by \u20E3), or any
// of the symbol classes above with its optional trailer.
const EMOJI_PATTERN = new RegExp(
    [
        String.raw`([#0-9]\u20E3)`,
        ...EMOJI_SYMBOL_CLASSES.map((symbolClass) => `${symbolClass}${EMOJI_TRAILER}`)
    ].join('|'),
    'g'
);

/**
 * Remove emoji and emoji-like symbols from a string.
 *
 * Every match of EMOJI_PATTERN is deleted; all other characters are returned
 * untouched and in their original order.
 *
 * @param {string} str - Text to clean.
 * @returns {string} `str` with every matched sequence removed.
 */
function removeEmoji(str) {
    // String.prototype.replace restarts a global regex from index 0 on every
    // call, so sharing one compiled pattern between calls is safe.
    return str.replace(EMOJI_PATTERN, '');
}

// Queries are split on whitespace and on ASCII and Unicode punctuation.
// WORD_SEPARATOR is the *body* of a RegExp character class (the text that goes
// between `[` and `]`). Each entry below is either a single class escape or a
// [first, last] pair that is rendered as the range `first-last`.
const WORD_SEPARATOR = [
    // Equivalent to \u0020\f\n\r\t\v\u00A0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\uFEFF
    ['\\s'],

    // "General Punctuation" block
    ['\u2000', '\u206F'],

    // "Supplemental Punctuation" block
    ['\u2E00', '\u2E7F'],

    // Punctuation from the "Basic Latin" block (\u0020 - \u007F):
    ['\u0021', '\u002F'], // !"#$%&'()*+-./,
    ['\u003A', '\u0040'], // :;<=>?@
    ['\u005B', '\u0060'], // [\]^_`
    ['\u007B', '\u007E'], // {|}~

    // Similar symbols from the "Halfwidth and Fullwidth Forms" block
    // (\uFF00 - \uFFEF)
    ['\uFF01', '\uFF0F'],
    ['\uFF1A', '\uFF20'],
    ['\uFF3B', '\uFF40'],
    ['\uFF5B', '\uFF65']
].map((bounds) => bounds.join('-')).join('');
// Also export the separator character-class body built above.
module.exports.WORD_SEPARATOR = WORD_SEPARATOR;

/**
 * Split a query string into lowercase tokens plus the separator text that
 * followed each token.
 *
 * Steps:
 *   1. Lowercase the query, delete apostrophes, periods and carets, and strip
 *      any leading separator characters.
 *   2. Walk the remainder as (token, trailing separators) pairs.
 *   3. Re-join digit groups that were split on a single `-` or `/`, so that
 *      inputs such as `1-2`, `1/2a` or `1/2-3b` stay one token.
 *   4. Skip tokens that are empty after `removeEmoji`.
 *   5. Split tokens containing CJK Unified Ideographs so each ideograph is
 *      its own token; the pieces are joined by empty separators and the last
 *      piece keeps the separator that followed the original token.
 *
 * @param {string} query - Query text to tokenize.
 * @param {*} [lonlat] - Not supported here; any truthy value throws.
 * @returns {{tokens: string[], separators: string[], owner: number[], lastWord: boolean}}
 *     `separators[i]` is the text that followed `tokens[i]`, `owner[i]` is
 *     `i`, and `lastWord` is always `false`.
 * @throws {Error} If `lonlat` is truthy.
 */
function tokenize(query, lonlat) {
    if (lonlat) throw new Error('Unsupported usage of tokenize. Use asReverse instead');

    const tokens = [];
    const separators = [];

    const normalized = query
        .toLowerCase()
        // collapse apostrophes, periods, caret
        .replace(/[\u2018\u2019\u02BC\u02BB\uFF07'\.\^]/g, '')
        // If the query begins with separators, tear them off.
        .replace(new RegExp(`^[${WORD_SEPARATOR}]+`, 'u'), '');

    // Sticky (`y`): every match must start exactly where the previous one
    // ended, so the loop consumes the string as consecutive
    // (token, separators) pairs. Group 1 is never empty and group 2 always
    // participates, so both captures are plain strings.
    const split = new RegExp(`([^${WORD_SEPARATOR}]+)([${WORD_SEPARATOR}]+|$)`, 'yu');

    // Numbers like 1-2, 1/2, 1-2a, 1/2a, 1/2-3b are kept as a single token.
    const joinableNumber = /^(\d+)(-|\/)(\d+)((-|\/)(\d+))?[a-z]?$/;

    let part;
    // Token held back because it was followed by exactly '-' or '/'; it is
    // either merged into the next token or flushed unchanged.
    let tail = null;
    // eslint-disable-next-line no-cond-assign
    while (part = split.exec(normalized)) {
        let t = part[1];
        const s = part[2];

        if (tail) {
            const combined = `${tail.t}${tail.s}${t}`;
            if (joinableNumber.test(combined)) {
                t = combined;
            } else {
                tokens.push(tail.t);
                separators.push(tail.s);
            }
            tail = null;
        }

        // Drop tokens that consist solely of emoji.
        if (removeEmoji(t).length === 0) continue;

        // \u4E00 - \u9FFF "CJK Unified Ideographs" characters are indexed
        // individually to support addresses being written from largest to
        // smallest geographical entity without delimiters. Adjacent numbers,
        // normal and full-width, are not split.
        const subtoken = t.split(/([\u4E00-\u9FFF])/u);
        if (subtoken.length > 1) {
            for (const piece of subtoken) {
                if (piece.length > 0) {
                    tokens.push(piece);
                    separators.push('');
                }
            }
            // Nothing separates the pieces from one another, but the last
            // piece is still followed by whatever followed the whole token.
            // At least one piece was pushed: the split only yields more than
            // one element when an ideograph matched, and the capture group
            // puts that ideograph in the result.
            separators[separators.length - 1] = s;
            continue;
        }

        // In some cases we want to combine two tokens.
        if (s === '-' || s === '/') {
            tail = { t, s };
        } else {
            tokens.push(t);
            separators.push(s);
        }
    }

    // A held-back token at the end of the query has nothing to merge with.
    if (tail) {
        tokens.push(tail.t);
        separators.push(tail.s);
    }

    const owner = tokens.map((_, i) => i);

    return { tokens, separators, owner, lastWord: false };
}