'''
scrape_nominatim_special_phrases.py
-----------------------------------

Simple script to scrape
https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases
for category-related phrases sometimes found in geocoder input.

Populates a per-language TSV with (key, value, is_plural, phrase) rows,
where the OSM keys/values are like:

amenity=restaurant
tourism=museum
shop=books

Using these phrases, it is possible to construct queries like
"restaurants in Brooklyn".
'''

import csv
import os
import re
import requests
import six
import sys
import time

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

from geodata.encoding import safe_decode, safe_encode

DEFAULT_CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'resources', 'categories')

# Use Special:Export to get wiki markup
WIKI_BASE_URL = 'https://wiki.openstreetmap.org/wiki/Special:Export/'

NOMINATIM_SPECIAL_PHRASES_PREFIX = 'Nominatim/Special Phrases'

NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX.replace(' ', '_')

# Matches one row of the special-phrases wiki table, capturing
# (phrase, key, value, operator, plural flag). Rows look roughly like:
#   | Zoo || tourism || zoo || - || N
phrase_table_re = re.compile(r'\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)

# Matches wiki links of the form [[Page name]] or [[Page name|anchor text]]
wiki_link_re = re.compile(r'(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')

IGNORE_LANGUAGES = {
    # Interlingua
    'ia',
}

IGNORE_PLURAL_LANGUAGES = {
    # For Japanese, the wiki seems to just put an "s" on the end, which
    # doesn't seem right. Need input from a native speaker on that one.
    'ja',
}

# Wait this many seconds between page fetches
POLITENESS_DELAY = 5.0


def scrape_nominatim_category_page(url, ignore_plurals=False):
    '''Yield (phrase, key, value, is_plural) tuples from one language page.'''
    result = requests.get(url)
    if not result or not result.content:
        return

    # Use result.text (unicode) rather than result.content (bytes) so the
    # regexes work under both Python 2 and Python 3
    for phrase, key, value, operator, plural in phrase_table_re.findall(result.text):
        # Skip phrases tied to a search operator (anything other than '-')
        if operator and operator != '-':
            continue
        is_plural = plural == 'Y'
        if is_plural and ignore_plurals:
            continue
        yield safe_decode(phrase).lower(), key, value, is_plural


def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
    '''Follow links from the main special-phrases page to each language page.'''
    print('Fetching main page')
    result = requests.get(url)
    languages = {}
    if not result or not result.content:
        return languages

    time.sleep(POLITENESS_DELAY)

    for entity, anchor_text in wiki_link_re.findall(result.text):
        if not entity.startswith(NOMINATIM_SPECIAL_PHRASES_PREFIX):
            continue
        # Language pages are named like "Nominatim/Special Phrases/EN"
        lang = entity.rstrip('/').rsplit('/')[-1].lower()
        if lang in IGNORE_LANGUAGES:
            continue
        link = WIKI_BASE_URL + entity.replace(' ', '_')
        ignore_plurals = lang in IGNORE_PLURAL_LANGUAGES
        print('Doing {}'.format(lang))
        phrases = list(scrape_nominatim_category_page(link, ignore_plurals=ignore_plurals))
        time.sleep(POLITENESS_DELAY)
        if not phrases:
            continue
        languages[lang] = phrases

    return languages


def open_tsv_for_write(filename):
    # The csv module expects binary-mode files on Python 2
    # and text-mode files on Python 3
    if six.PY2:
        return open(filename, 'wb')
    return open(filename, 'w', encoding='utf-8', newline='')


def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
    languages = scrape_all_nominatim_category_pages(url=url)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for lang, phrases in six.iteritems(languages):
        filename = os.path.join(output_dir, '{}.tsv'.format(lang.lower()))
        with open_tsv_for_write(filename) as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(('key', 'value', 'is_plural', 'phrase'))
            for phrase, key, value, is_plural in phrases:
                row = (key, value, str(int(is_plural)), phrase)
                if six.PY2:
                    # csv needs encoded bytes on Python 2
                    row = tuple(safe_encode(col) for col in row)
                writer.writerow(row)
    print('Done')


if __name__ == '__main__':
    main()
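
# ---------------------------------------------------------------------------
# Example consumer (a minimal sketch, not part of the scraper itself): load
# one of the generated per-language TSVs into a dict for phrase lookups,
# e.g. to match "restaurants" in a query like "restaurants in Brooklyn".
# The 'en.tsv' path in the commented usage below is hypothetical; substitute
# whichever language file the scrape actually produced.

def load_category_phrases(filename):
    '''Return a dict mapping phrase -> (OSM key, OSM value, is_plural).'''
    phrases = {}
    with open(filename) as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)  # skip the header row: key, value, is_plural, phrase
        for key, value, is_plural, phrase in reader:
            phrases[safe_decode(phrase)] = (key, value, is_plural == '1')
    return phrases

# categories = load_category_phrases(os.path.join(DEFAULT_CATEGORIES_DIR, 'en.tsv'))
# categories.get(u'restaurants')  # e.g. ('amenity', 'restaurant', True)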