
### Procedures to analyse RTRBM prediction performance.

import rbm

def norm(x):
    """Return the mean of the element-wise squares of ``x``."""
    squared = x * x
    return squared.mean()

## Prediction samples from the conditional RBM at each timestep and compares
## the result with the actual data. Sampling runs for 50 Gibbs steps; other
## values were not tried, and the exact number should (hopefully) not matter much.

g_gen = 50

def empirically_evaluate(t, num_seq):
    """
    Estimate the mean squared one-step prediction error on validation data.

    ``num_seq`` sequences are drawn with ``t.valid_data_fn()``. The hidden
    activations are first computed deterministically over every whole
    sequence. Then, for each timestep from index 2 onwards, the visibles are
    predicted with ``rbm.sample_last_mf`` (``g_gen`` Gibbs steps), using the
    visible-hidden weights whose hidden bias is shifted by the contribution
    of the previous hidden state, and the squared difference to the actual
    visibles is accumulated.

    The same approximate procedure serves both the TRBM and the RTRBM: the
    RTRBM uses mean-field for inference, and the TRBM likewise only computes
    the "mean-field" values of its hiddens instead of sampling them.

    Parameters:
        t: object providing ``valid_data_fn()`` and ``W``; ``W`` unpacks as
           ``VH, HH, b_init`` and exposes ``v``, ``h`` and ``vis_gauss``.
        num_seq: number of validation sequences to evaluate.

    Returns:
        The accumulated squared error divided by ``T * bs * v``, as a float.
    """

    import gnumpy as gpu

    ## NOTE(review): each sequence returned by valid_data_fn() is assumed to
    ## fit a (100, 900) slot, i.e. 100 timesteps of 900 visibles -- confirm.
    V = gpu.zeros((num_seq, 100, 900))
    for seq_idx in range(num_seq):
        V[seq_idx, :, :] = t.valid_data_fn()
    W = t.W
    bs, T, v = V.shape
    h = W.h
    assert(W.v == v)
    VH, HH, b_init = W

    ## step 1: approximate mean-field inference over the whole sequence.
    H = gpu.zeros((bs, T, h))         ## hidden activations (H[:, k, :] -> activations at time k)
    B = gpu.zeros((bs, T, h))         ## dynamic hidden biases (B[:, k, :] -> biases at time k)
    ### first element of the sequence uses the initial bias:
    H[:, 0, :] = (VH * V[:, 0, :] + b_init).logistic()
    ### rest of the sequence:
    for step in range(1, T):
        ### the bias at this step comes from the previous hidden activations
        B[:, step, :] = HH * H[:, step - 1, :]
        ### then calculate activations
        H[:, step, :] = (VH * V[:, step, :] + B[:, step, :]).logistic()

    ## step 2: predict each timestep and accumulate the squared error.
    ## NOTE(review): the sum covers timesteps 2..T-1, yet the result is
    ## normalised by all T timesteps -- confirm this is intended.
    loss = 0
    for step in range(2, T):
        target = V[:, step, :]
        ## VH_t is the set of weights of the "RBM" at this timestep: the
        ## normal vis-hid weights plus the extra bias from the previous step.
        ## presumably `1 * VH` yields a copy so VH itself stays untouched.
        VH_t = 1 * VH
        ## original bias + extra, dynamic bias.
        VH_t[2] = VH[2] + B[:, step, :]
        ## per the original (unfinished) note, taking the last step from
        ## mean-field matters in the gaussian-visible case.
        pred, hid = rbm.sample_last_mf(VH_t, g_gen, 1, W.vis_gauss)
        loss += ((target - pred)**2).sum()
    return (float(loss) / (T * bs)) / v



def match_successor_distribution(t, context_tree, gibbs_steps):
    """
    Compare predicted and empirical successor distributions over a context tree.

    The tree is visited breadth-first starting at ``context_tree.head``. For
    every node that has at least one child, the target distribution is the
    vector of the children's ``prob`` values (0 where a child is missing).
    The network is conditioned on the node's ``content`` (or on the initial
    bias when the content is the empty string), ``rbm.sample_last_mf`` is
    run for ``gibbs_steps`` Gibbs steps, and its output is averaged over
    axis 0 and normalised to sum to one before being compared to the target.

    Parameters:
        t: object providing ``W``; ``W`` unpacks as ``VH, HH, b_init`` and
           exposes ``v`` (must equal 27), ``h`` and ``vis_gauss``.
        context_tree: tree whose nodes expose ``children`` (indexable by
           0..26), ``prob`` and ``content``, with the root in ``head``.
        gibbs_steps: number of Gibbs steps passed to ``rbm.sample_last_mf``.

    Returns:
        A tuple of three floats, each averaged over the evaluated nodes:
        cosine similarity, KL divergence (base-2 logs, restricted to entries
        where both distributions are non-zero), and the square root of the
        mean squared difference.

    Raises:
        ZeroDivisionError: if no node in the tree has any child.
    """

    from collections import deque
    from pylab import zeros, sigmoid, newaxis, sqrt
    import data.words   as words
    import gnumpy       as gpu
    import numpy        as np
    import numpy.linalg as la

    ALPHABET_SIZE = 27
    W = t.W
    v = ALPHABET_SIZE
    h = W.h
    assert(W.v == v)
    VH, HH, b_init = W
    ## too few visible units to benefit from the gpu: work on numpy arrays.
    ## NOTE(review): these assignments write into the objects unpacked from
    ## t.W, so the model appears to keep the numpy arrays afterwards -- confirm.
    VH[0][0] = gpu.as_numpy_array(VH[0][0])
    VH[1]    = gpu.as_numpy_array(VH[1])
    VH[2]    = gpu.as_numpy_array(VH[2])
    HH[0][0] = gpu.as_numpy_array(HH[0][0])
    HH[1]    = gpu.as_numpy_array(HH[1])
    HH[2]    = gpu.as_numpy_array(HH[2])
    b_init   = gpu.as_numpy_array(b_init)

    ### visit the contexts tree breadth-first and compute prediction errors
    tot_eval = 0
    error_cos = error_L2 = error_KL = 0
    queue = deque([context_tree.head])
    while queue:
        current = queue.popleft()   ## current prefix to analyse
        target = []                 ## vector of successors distribution
        has_children = False
        ## enqueue every existing child and record its probability
        for ch in range(ALPHABET_SIZE):
            child = current.children[ch]
            if child is not None:
                queue.append(child)
                target.append(child.prob)
                has_children = True
            else:
                target.append(0)
        ## a prefix without successors has no distribution to match
        if not has_children:
            continue

        ### condition the network and calculate dynamic bias
        context = current.content
        VH_t = 1 * VH
        if context != '':
            ### if there is context, condition the network.
            context_length = len(context)
            V = zeros((context_length, ALPHABET_SIZE))
            H = zeros((context_length, h))
            B = zeros((context_length, h))
            ### run the network with the given context
            V[[0]] = words.codify_letter(context[0])
            H[[0]] = sigmoid(gpu.as_numpy_array(VH * V[[0]]) + b_init[newaxis, :])
            for step in range(1, context_length):
                B[[step]] = gpu.as_numpy_array(HH * H[[step - 1]])
                V[[step]] = words.codify_letter(context[step])
                H[[step]] = sigmoid(gpu.as_numpy_array(VH * V[[step]]) + B[[step]])
            ### the bias for the *next* element comes from the last hiddens
            B[[-1]] = gpu.as_numpy_array(HH * H[[-1]])
            VH_t[2] = VH[2] + B[[-1]]
        else:
            ### if no context is given, add the initial bias
            VH_t[2] = VH[2] + b_init

        ## the third argument is presumably the number of samples drawn;
        ## they are averaged over axis 0 and normalised to sum to one.
        pred, hid = rbm.sample_last_mf(VH_t, gibbs_steps, 10000, W.vis_gauss)
        pred = np.mean(gpu.as_numpy_array(pred), 0)
        norm_pred = pred / pred.sum()
        target = np.array(target)

        ### L2 norm:
        prefix_error_L2 = sqrt((((target - norm_pred)**2).sum()) / v)
        ### cosine similarity:
        prefix_error_cos = np.inner(target, norm_pred) / (la.norm(target) * la.norm(norm_pred))
        ### KL-divergence: no smoothing is applied, so only entries where
        ### both distributions are non-zero contribute.
        positives = (target != 0.) & (norm_pred != 0.)
        kl_target = target[positives]
        kl_pred = norm_pred[positives]
        prefix_error_KL = np.sum(kl_target * (np.log2(kl_target) - np.log2(kl_pred)))

        error_cos += prefix_error_cos
        error_L2 += prefix_error_L2
        error_KL += prefix_error_KL
        tot_eval += 1
    return (float(error_cos) / tot_eval), (float(error_KL) / tot_eval), (float(error_L2) / tot_eval)


def likelihood(t, sequence, gibbs_steps):
    """
    Return the predicted probability of the last symbol of ``sequence``.

    All symbols but the last form the context: they are fed through the
    network one at a time, and the hidden activations reached after the
    final context symbol provide the dynamic bias for the prediction. When
    the sequence holds a single symbol there is no context and the initial
    bias is used instead. ``rbm.sample_last_mf`` is then run for
    ``gibbs_steps`` Gibbs steps; its output is averaged over axis 0 and
    normalised to sum to one, and the entry belonging to the last symbol is
    returned.

    Parameters:
        t: object providing ``W``; ``W`` unpacks as ``VH, HH, b_init`` and
           exposes ``v`` (must equal 27), ``h`` and ``vis_gauss``.
        sequence: sequence of symbols drawn from 'a'..'z' and '$'.
        gibbs_steps: number of Gibbs steps passed to ``rbm.sample_last_mf``.

    Returns:
        The normalised prediction indexed by the positions where the
        alphabet equals ``sequence[-1]``, i.e. a one-element array for a
        symbol of the alphabet (empty for an unknown symbol).
    """

    from pylab import newaxis, zeros, sigmoid
    import data.words   as words
    import gnumpy       as gpu
    import numpy        as np

    alphabet = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '$'])
    ALPHABET_SIZE = 27
    W = t.W
    v = ALPHABET_SIZE
    h = W.h
    assert(W.v == v)
    VH, HH, b_init = W
    ## too few visible units to benefit from the gpu: work on numpy arrays.
    ## NOTE(review): these assignments write into the objects unpacked from
    ## t.W, so the model appears to keep the numpy arrays afterwards -- confirm.
    VH[0][0] = gpu.as_numpy_array(VH[0][0])
    VH[1]    = gpu.as_numpy_array(VH[1])
    VH[2]    = gpu.as_numpy_array(VH[2])
    HH[0][0] = gpu.as_numpy_array(HH[0][0])
    HH[1]    = gpu.as_numpy_array(HH[1])
    HH[2]    = gpu.as_numpy_array(HH[2])
    b_init   = gpu.as_numpy_array(b_init)

    sequence = gpu.as_numpy_array(sequence)
    ## the final symbol is the one being scored; everything before it is context
    n_context = len(sequence) - 1
    VH_t = 1 * VH
    if n_context > 0:
        ### if there is context, condition the network.
        ### The arrays hold one row per *context* symbol, so that H[[-1]]
        ### below is the last hidden state actually computed rather than an
        ### untouched row of zeros.
        V = zeros((n_context, ALPHABET_SIZE))
        H = zeros((n_context, h))
        B = zeros((n_context, h))
        ### run the network with the given context
        V[[0]] = words.codify_letter(sequence[0])
        H[[0]] = sigmoid(gpu.as_numpy_array(VH * V[[0]]) + b_init[newaxis, :])
        for step in range(1, n_context):
            B[[step]] = gpu.as_numpy_array(HH * H[[step - 1]])
            V[[step]] = words.codify_letter(sequence[step])
            H[[step]] = sigmoid(gpu.as_numpy_array(VH * V[[step]]) + B[[step]])
        ### the bias for the element to predict comes from the last hiddens
        B[[-1]] = gpu.as_numpy_array(HH * H[[-1]])
        VH_t[2] = VH[2] + B[[-1]]
    else:
        ### if no context is given, add the initial bias
        VH_t[2] = VH[2] + b_init

    ## the third argument is presumably the number of samples drawn;
    ## they are averaged over axis 0 and normalised to sum to one.
    pred, hid = rbm.sample_last_mf(VH_t, gibbs_steps, 10000, W.vis_gauss)
    pred = np.mean(gpu.as_numpy_array(pred), 0)
    norm_pred = pred / pred.sum()
    idx = np.where(alphabet == sequence[-1])[0]
    return norm_pred[idx]


def calculate_perplexity(t, data, gibbs_steps):
    """
    Compute the per-symbol perplexity of the model over a collection of words.

    Each word is scored incrementally: after every letter, and once more
    after the '$' symbol appended at the end of the word, ``likelihood`` is
    called on the prefix built so far and the natural log of its result is
    accumulated. Prefixes with zero likelihood are reported on standard
    output, as is the total number of scored symbols.

    Parameters:
        t: model/trainer object, forwarded unchanged to ``likelihood``.
        data: iterable of words, each an iterable of letters.
        gibbs_steps: number of Gibbs steps, forwarded to ``likelihood``.

    Returns:
        ``exp(-total_log_likelihood / n)`` where ``n`` is the number of
        scored symbols; it has whatever array shape ``likelihood`` returns.

    Raises:
        ZeroDivisionError: if ``data`` is empty (no symbol was scored).
    """
    import numpy as np
    n = 0
    lik = 0
    for word in data:
        sequence = []
        ## score every letter of the word, then the trailing '$' symbol
        for symbol in list(word) + ['$']:
            sequence.append(symbol)
            prob = likelihood(t, sequence, gibbs_steps)
            if prob == 0:
                ## single-argument form: same output under Python 2 and 3
                print('\nZero likelihood:  %s' % (sequence,))
            lik += np.log(prob)
            n += 1
    print('\nn:  %s \n' % (n,))
    perplexity = np.exp(-lik/n)
    return perplexity


