# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import numpy as np
import pytest

import megengine as mge
import megengine.functional as F
from megengine import Tensor, jit, random
from megengine.core._imperative_rt import CompNode
from megengine.core._imperative_rt.core2 import apply
from megengine.core._imperative_rt.ops import (
    delete_rng_handle,
    get_global_rng_seed,
    new_rng_handle,
)
from megengine.core.autodiff.grad import Grad
from megengine.core.ops.builtin import (
    BetaRNG,
    GammaRNG,
    GaussianRNG,
    PermutationRNG,
    PoissonRNG,
    UniformRNG,
)
from megengine.device import get_device_count
from megengine.jit import trace
from megengine.random import RNG
from megengine.random import seed as set_global_seed
from megengine.random import uniform


@pytest.mark.skipif(
    get_device_count("xpu") <= 2, reason="xpu counts need > 2",
)
def test_gaussian_op():
    """GaussianRNG: check mean/std of samples on the default device and on an
    explicit handle bound to xpu2."""
    # FIXME: remove this sync
    mge.core.set_option("async_level", 0)
    set_global_seed(1024)
    shape = (
        8,
        9,
        11,
        12,
    )
    shape = Tensor(shape, dtype="int32")
    op = GaussianRNG(seed=get_global_rng_seed(), mean=1.0, std=3.0, dtype="float32")
    (output,) = apply(op, shape)
    assert np.fabs(output.numpy().mean() - 1.0) < 1e-1
    assert np.fabs(np.sqrt(output.numpy().var()) - 3.0) < 1e-1
    assert str(output.device) == str(CompNode("xpux"))
    assert output.dtype == np.float32

    # Same op driven through an explicitly created RNG handle on xpu2.
    cn = CompNode("xpu2")
    seed = 233333
    h = new_rng_handle(cn, seed)
    op = GaussianRNG(seed=seed, mean=3.0, std=1.0, dtype="float32", handle=h)
    (output,) = apply(op, shape)
    delete_rng_handle(h)
    assert np.fabs(output.numpy().mean() - 3.0) < 1e-1
    assert np.fabs(np.sqrt(output.numpy().var()) - 1.0) < 1e-1
    assert str(output.device) == str(cn)
    assert output.dtype == np.float32


@pytest.mark.skipif(
    get_device_count("xpu") <= 2, reason="xpu counts need > 2",
)
def test_uniform_op():
    """UniformRNG: mean of U(0, 1) samples should be ~0.5 on both the default
    device and an explicit xpu2 handle."""
    set_global_seed(1024)
    shape = (
        8,
        9,
        11,
        12,
    )
    shape = Tensor(shape, dtype="int32")
    op = UniformRNG(seed=get_global_rng_seed(), dtype="float32")
    (output,) = apply(op, shape)
    assert np.fabs(output.numpy().mean() - 0.5) < 1e-1
    assert str(output.device) == str(CompNode("xpux"))
    assert output.dtype == np.float32

    cn = CompNode("xpu2")
    seed = 233333
    h = new_rng_handle(cn, seed)
    op = UniformRNG(seed=seed, dtype="float32", handle=h)
    (output,) = apply(op, shape)
    delete_rng_handle(h)
    assert np.fabs(output.numpy().mean() - 0.5) < 1e-1
    assert str(output.device) == str(cn)
    assert output.dtype == np.float32


@pytest.mark.skipif(
    get_device_count("xpu") <= 2, reason="xpu counts need > 2",
)
def test_gamma_op():
    """GammaRNG: Gamma(shape, scale) has mean shape*scale and std sqrt(shape)*scale."""
    set_global_seed(1024)
    _shape, _scale = 2, 0.8
    _expected_mean, _expected_std = _shape * _scale, np.sqrt(_shape) * _scale

    shape = F.full([8, 9, 11, 12], value=_shape, dtype="float32")
    scale = F.full([8, 9, 11, 12], value=_scale, dtype="float32")
    op = GammaRNG(seed=get_global_rng_seed(), handle=0)
    (output,) = apply(op, shape, scale)
    assert np.fabs(output.numpy().mean() - _expected_mean) < 1e-1
    assert np.fabs(np.sqrt(output.numpy().var()) - _expected_std) < 1e-1
    assert str(output.device) == str(CompNode("xpux"))

    cn = CompNode("xpu2")
    seed = 233333
    h = new_rng_handle(cn, seed)
    shape = F.full([8, 9, 11, 12], value=_shape, dtype="float32", device="xpu2")
    scale = F.full([8, 9, 11, 12], value=_scale, dtype="float32", device="xpu2")
    op = GammaRNG(seed=seed, handle=h)
    (output,) = apply(op, shape, scale)
    delete_rng_handle(h)
    assert np.fabs(output.numpy().mean() - _expected_mean) < 1e-1
    assert np.fabs(np.sqrt(output.numpy().var()) - _expected_std) < 1e-1
    assert str(output.device) == str(cn)


@pytest.mark.skipif(
    get_device_count("xpu") <= 2, reason="xpu counts need > 2",
)
def test_beta_op():
    """BetaRNG: check sample mean/std against the analytic Beta(alpha, beta) moments."""
    set_global_seed(1024)
    _alpha, _beta = 2, 0.8
    _expected_mean = _alpha / (_alpha + _beta)
    _expected_std = np.sqrt(
        _alpha * _beta / ((_alpha + _beta) ** 2 * (_alpha + _beta + 1))
    )
    alpha = F.full([8, 9, 11, 12], value=_alpha, dtype="float32")
    beta = F.full([8, 9, 11, 12], value=_beta, dtype="float32")
    op = BetaRNG(seed=get_global_rng_seed())
    (output,) = apply(op, alpha, beta)
    assert np.fabs(output.numpy().mean() - _expected_mean) < 1e-1
    assert np.fabs(np.sqrt(output.numpy().var()) - _expected_std) < 1e-1
    assert str(output.device) == str(CompNode("xpux"))

    cn = CompNode("xpu2")
    seed = 233333
    h = new_rng_handle(cn, seed)
    alpha = F.full([8, 9, 11, 12], value=_alpha, dtype="float32", device=cn)
    beta = F.full([8, 9, 11, 12], value=_beta, dtype="float32", device=cn)
    op = BetaRNG(seed=seed, handle=h)
    (output,) = apply(op, alpha, beta)
    delete_rng_handle(h)
    assert np.fabs(output.numpy().mean() - _expected_mean) < 1e-1
    assert np.fabs(np.sqrt(output.numpy().var()) - _expected_std) < 1e-1
    assert str(output.device) == str(cn)


@pytest.mark.skipif(
    get_device_count("xpu") <= 2, reason="xpu counts need > 2",
)
def test_poisson_op():
    """PoissonRNG: Poisson(lam) has mean lam and std sqrt(lam)."""
    set_global_seed(1024)
    lam = F.full([8, 9, 11, 12], value=2, dtype="float32")
    op = PoissonRNG(seed=get_global_rng_seed())
    (output,) = apply(op, lam)
    assert np.fabs(output.numpy().mean() - 2.0) < 1e-1
    assert np.fabs(np.sqrt(output.numpy().var()) - np.sqrt(2.0)) < 1e-1
    assert str(output.device) == str(CompNode("xpux"))

    cn = CompNode("xpu2")
    seed = 233333
    h = new_rng_handle(cn, seed)
    lam = F.full([8, 9, 11, 12], value=2, dtype="float32", device=cn)
    op = PoissonRNG(seed=seed, handle=h)
    (output,) = apply(op, lam)
    delete_rng_handle(h)
    assert np.fabs(output.numpy().mean() - 2.0) < 1e-1
    assert np.fabs(np.sqrt(output.numpy().var()) - np.sqrt(2.0)) < 1e-1
    assert str(output.device) == str(cn)


@pytest.mark.skipif(
    get_device_count("xpu") <= 2, reason="xpu counts need > 2",
)
def test_permutation_op():
    """PermutationRNG: output must be a genuine permutation of 0..n-1 (few fixed
    points, sorted output equals the identity) for several dtypes."""
    set_global_seed(1024)
    n = 1000

    def test_permutation_op_dtype(dtype):
        def sum_result(res, fun):
            # Count fixed points: positions where value == index.
            return sum([1 if i == v else 0 for i, v in enumerate(fun(res.numpy()))])

        shape = Tensor((n,), dtype="int32")
        op = PermutationRNG(seed=get_global_rng_seed(), dtype=dtype)
        (output,) = apply(op, shape)
        assert sum_result(output, lambda x: x) < 500
        assert sum_result(output, np.sort) == n
        assert str(output.device) == str(CompNode("xpux"))
        assert output.dtype == dtype

        cn = CompNode("xpu2")
        seed = 233333
        h = new_rng_handle(cn, seed)
        op = PermutationRNG(seed=seed, handle=h, dtype=dtype)
        (output,) = apply(op, shape)
        delete_rng_handle(h)
        assert sum_result(output, lambda x: x) < 500
        assert sum_result(output, np.sort) == n
        assert str(output.device) == str(cn)
        assert output.dtype == dtype

    test_permutation_op_dtype(np.float32)
    test_permutation_op_dtype(np.int32)
    test_permutation_op_dtype(np.int16)


@pytest.mark.skipif(
    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
def test_UniformRNG():
    """RNG.uniform: same seed on different devices gives identical streams;
    different seeds (or consecutive draws) differ; mean matches (low+high)/2."""
    m1 = RNG(seed=111, device="xpu0")
    m2 = RNG(seed=111, device="xpu1")
    m3 = RNG(seed=222, device="xpu0")
    out1 = m1.uniform(size=(100,))
    out1_ = m1.uniform(size=(100,))  # second draw from the same generator
    out2 = m2.uniform(size=(100,))
    out3 = m3.uniform(size=(100,))
    np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6)
    assert out1.device == "xpu0" and out2.device == "xpu1"
    assert not (out1.numpy() == out3.numpy()).all()
    assert not (out1.numpy() == out1_.numpy()).all()

    low = -234
    high = 123
    out = m1.uniform(low=low, high=high, size=(20, 30, 40))
    out_shp = out.shape
    if isinstance(out_shp, tuple):
        assert out_shp == (20, 30, 40)
    else:
        assert all(out.shape.numpy() == np.array([20, 30, 40]))
    assert np.abs(out.mean().numpy() - ((low + high) / 2)) / (high - low) < 0.1


@pytest.mark.skipif(
    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
def test_NormalRNG():
    """RNG.normal: cross-device reproducibility plus mean/std sanity checks."""
    m1 = RNG(seed=111, device="xpu0")
    m2 = RNG(seed=111, device="xpu1")
    m3 = RNG(seed=222, device="xpu0")
    out1 = m1.normal(size=(100,))
    out1_ = m1.uniform(size=(100,))  # a further draw; only needs to differ from out1
    out2 = m2.normal(size=(100,))
    out3 = m3.normal(size=(100,))
    np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6)
    assert out1.device == "xpu0" and out2.device == "xpu1"
    assert not (out1.numpy() == out3.numpy()).all()
    assert not (out1.numpy() == out1_.numpy()).all()

    mean = -1
    std = 2
    out = m1.normal(mean=mean, std=std, size=(20, 30, 40))
    out_shp = out.shape
    if isinstance(out_shp, tuple):
        assert out_shp == (20, 30, 40)
    else:
        assert all(out.shape.numpy() == np.array([20, 30, 40]))
    assert np.abs(out.mean().numpy() - mean) / std < 0.1
    assert np.abs(np.std(out.numpy()) - std) < 0.1


@pytest.mark.skipif(
    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
def test_GammaRNG():
    """RNG.gamma: reproducibility across devices and broadcasting of tensor
    shape/scale parameters onto the sampled batch dimensions."""
    m1 = RNG(seed=111, device="xpu0")
    m2 = RNG(seed=111, device="xpu1")
    m3 = RNG(seed=222, device="xpu0")
    out1 = m1.gamma(2, size=(100,))
    out1_ = m1.uniform(size=(100,))  # a further draw; only needs to differ from out1
    out2 = m2.gamma(2, size=(100,))
    out3 = m3.gamma(2, size=(100,))
    np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6)
    assert out1.device == "xpu0" and out2.device == "xpu1"
    assert not (out1.numpy() == out3.numpy()).all()
    assert not (out1.numpy() == out1_.numpy()).all()

    shape = Tensor([[2, 3, 4], [9, 10, 11]], dtype=np.float32, device="xpu0")
    scale = Tensor([0.5, 1, 1.5], dtype=np.float32, device="xpu0")
    expected_mean = (shape * scale).numpy()
    expected_std = (F.sqrt(shape) * scale).numpy()
    out = m1.gamma(shape=shape, scale=scale, size=(20, 30, 40))
    out_shp = out.shape
    # size dims are prepended to the broadcast parameter shape (2, 3).
    if isinstance(out_shp, tuple):
        assert out_shp == (20, 30, 40, 2, 3)
    else:
        assert all(out.shape.numpy() == np.array([20, 30, 40, 2, 3]))
    assert (
        np.abs(out.mean(axis=(0, 1)).numpy() - expected_mean) / expected_std
    ).mean() < 0.1
    assert (np.abs(np.std(out.numpy(), axis=(0, 1)) - expected_std)).mean() < 0.1


@pytest.mark.skipif(
    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
def test_BetaRNG():
    """RNG.beta: reproducibility across devices and broadcasting of tensor
    alpha/beta parameters; sample moments match analytic Beta moments."""
    m1 = RNG(seed=111, device="xpu0")
    m2 = RNG(seed=111, device="xpu1")
    m3 = RNG(seed=222, device="xpu0")
    out1 = m1.beta(2, 1, size=(100,))
    out1_ = m1.uniform(size=(100,))  # a further draw; only needs to differ from out1
    out2 = m2.beta(2, 1, size=(100,))
    out3 = m3.beta(2, 1, size=(100,))
    np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6)
    assert out1.device == "xpu0" and out2.device == "xpu1"
    assert not (out1.numpy() == out3.numpy()).all()
    assert not (out1.numpy() == out1_.numpy()).all()

    alpha = Tensor([[2, 3, 4], [9, 10, 11]], dtype=np.float32, device="xpu0")
    beta = Tensor([0.5, 1, 1.5], dtype=np.float32, device="xpu0")
    expected_mean = (alpha / (alpha + beta)).numpy()
    expected_std = (
        F.sqrt(alpha * beta / (F.pow(alpha + beta, 2) * (alpha + beta + 1)))
    ).numpy()
    out = m1.beta(alpha=alpha, beta=beta, size=(20, 30))
    out_shp = out.shape
    if isinstance(out_shp, tuple):
        assert out_shp == (20, 30, 2, 3)
    else:
        assert all(out.shape.numpy() == np.array([20, 30, 2, 3]))
    assert (
        np.abs(out.mean(axis=(0, 1)).numpy() - expected_mean) / expected_std
    ).mean() < 0.1
    assert (np.abs(np.std(out.numpy(), axis=(0, 1)) - expected_std)).mean() < 0.1


@pytest.mark.skipif(
    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
def test_PoissonRNG():
    """RNG.poisson: reproducibility across devices; output shape is size dims
    prepended to lam's shape; sample moments match Poisson moments."""
    m1 = RNG(seed=111, device="xpu0")
    m2 = RNG(seed=111, device="xpu1")
    m3 = RNG(seed=222, device="xpu0")
    lam = Tensor([[2, 3, 4], [9, 10, 11]], dtype=np.float32)
    out1 = m1.poisson(lam.to("xpu0"), size=(100,))
    out2 = m2.poisson(lam.to("xpu1"), size=(100,))
    out3 = m3.poisson(lam.to("xpu0"), size=(100,))
    np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6)
    assert out1.device == "xpu0" and out2.device == "xpu1"
    assert not (out1.numpy() == out3.numpy()).all()

    out = m1.poisson(lam.to("xpu0"), size=(20, 30))
    out_shp = out.shape
    expected_shape = (20, 30) + lam._tuple_shape
    if isinstance(out_shp, tuple):
        assert out_shp == expected_shape
    else:
        assert all(out.shape.numpy() == np.array(expected_shape))
    lam = lam.numpy()
    assert (np.abs(out.mean(axis=(0, 1)).numpy() - lam) / np.sqrt(lam)).mean() < 0.1
    assert np.abs(np.std(out.numpy(), axis=(0, 1)) - np.sqrt(lam)).mean() < 0.1


@pytest.mark.skipif(
    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
@pytest.mark.parametrize("symbolic", [True, False])
def test_PermutationRNG(symbolic):
    """RNG.permutation: cross-device reproducibility, permutation validity, and
    scalar/tensor inputs under jit trace (both symbolic modes)."""
    m1 = RNG(seed=111, device="xpu0")
    m2 = RNG(seed=111, device="xpu1")
    m3 = RNG(seed=222, device="xpu0")
    out1 = m1.permutation(1000)
    out1_ = m1.uniform(size=(1000,))  # a further draw; only needs to differ from out1
    out2 = m2.permutation(1000)
    out3 = m3.permutation(1000)
    np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6)
    assert out1.device == "xpu0" and out2.device == "xpu1"
    assert not (out1.numpy() == out3.numpy()).all()
    assert not (out1.numpy() == out1_.numpy()).all()

    out = m1.permutation(1000)
    out_shp = out.shape
    if isinstance(out_shp, tuple):
        assert out_shp == (1000,)
    else:
        assert all(out.shape.numpy() == np.array([1000]))

    def sum_result(res, fun):
        # Count fixed points: positions where value == index.
        return sum([1 if i == v else 0 for i, v in enumerate(fun(res.numpy()))])

    assert sum_result(out, lambda x: x) < 500
    assert sum_result(out, np.sort) == 1000

    def func():
        out = m1.permutation(Tensor(7))
        out_shp = out.shape
        if isinstance(out_shp, tuple):
            assert out_shp == (1,)
        else:
            assert all(out.shape.numpy() == np.array([1]))
        n, m = 6, 3
        # Permuting a 2-D tensor shuffles along the first axis; shape preserved.
        out = m1.permutation(Tensor(np.arange(n * m), dtype="float32").reshape(n, m))
        out_shp = out.shape
        if isinstance(out_shp, tuple):
            assert out_shp == (n, m)
        else:
            assert all(out.shape.numpy() == np.array([n, m]))

    func = trace(symbolic=symbolic)(func)
    func()


@pytest.mark.skipif(
    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
def test_ShuffleRNG():
    """shuffle: differentiable in-place shuffle; same seed reproduces the same
    permutation across devices; 2-D input keeps its shape."""
    g = []

    def cb(grad):
        g.append(grad)

    n, m = 6, 3
    arr = np.arange(n * m)
    out0 = Tensor(arr, dtype="float32")
    # shuffle must participate in autodiff: backprop through it should succeed.
    with Grad() as grad:
        grad.wrt(out0, callback=cb)
        random.shuffle(out0)
        grad(out0, F.ones_like(out0))
    m1 = RNG(seed=111, device="xpu0")
    m2 = RNG(seed=111, device="xpu1")
    m3 = RNG(seed=222, device="xpu0")
    out1 = Tensor(arr, dtype="float32", device="xpu0")
    out2 = Tensor(arr, dtype="float32", device="xpu1")
    out3 = Tensor(arr, dtype="float32", device="xpu0")
    m1.shuffle(out1)
    m2.shuffle(out2)
    m3.shuffle(out3)
    np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6)
    assert out1.device == "xpu0" and out2.device == "xpu1"
    assert not (out1.numpy() == out3.numpy()).all()

    out = Tensor(arr, dtype="float32").reshape(n, m)
    m1.shuffle(out)
    out_shp = out.shape
    if isinstance(out_shp, tuple):
        assert out_shp == (n, m)
    else:
        assert all(out.shape.numpy() == np.array([n, m]))


def test_seed():
    """Global seed: reseeding with the same value replays the stream; a
    different seed (or no reseed) produces different samples."""
    set_global_seed(10)
    out1 = uniform(size=[10, 10])
    out2 = uniform(size=[10, 10])
    assert not (out1.numpy() == out2.numpy()).all()

    set_global_seed(10)
    out3 = uniform(size=[10, 10])
    np.testing.assert_allclose(out1.numpy(), out3.numpy(), atol=1e-6)

    set_global_seed(11)
    out4 = uniform(size=[10, 10])
    assert not (out1.numpy() == out4.numpy()).all()


@pytest.mark.parametrize("is_symbolic", [None, False, True])
def test_rng_empty_tensor(is_symbolic):
    """All RNG ops must tolerate empty output shapes, eagerly and under trace."""
    set_global_seed(1024)
    shapes = [
        (0,),
        (0, 0, 0),
        (10, 0, 10),
    ]

    def fn(shape):
        o1 = random.uniform(0, 1, shape)
        o2 = random.normal(0, 1, shape)
        o3 = random.gamma(2, 1, shape)
        o4 = random.beta(2, 1, shape)
        o5 = random.poisson(2, shape)
        return o1, o2, o3, o4, o5

    for shape in shapes:
        if is_symbolic is not None:
            fn_ = jit.trace(symbolic=is_symbolic)(fn)
        else:
            fn_ = fn
        # Run traced functions a few times to exercise the compiled path.
        for _ in range(3):
            outs = fn_(shape)
            for out in outs:
                np.testing.assert_equal(out.numpy().shape, shape)
            if is_symbolic is None:
                break

    def fn2(n):
        return random.permutation(n=n)

    if is_symbolic is not None:
        fn2 = jit.trace(symbolic=is_symbolic)(fn2)
    for _ in range(3):
        out = fn2(0)
        np.testing.assert_equal(out.numpy().shape, (0,))
        if is_symbolic is None:
            break

    # Restore the async level lowered by test_gaussian_op (see FIXME there).
    mge.core.set_option("async_level", 2)