
### Set of procedures to analyse hidden units activations.

from   copy       import deepcopy
import std.basic  as bas
import numpy      as np
import gnumpy     as gpu
import data.words as words


# Symbol inventory: the 26 lowercase ASCII letters followed by '$'.
# NOTE(review): '$' presumably marks the end of a word -- confirm against data.words.
alphabet = list('abcdefghijklmnopqrstuvwxyz') + ['$']

def distances_stats(input_file, output_file):
    """Write pairwise-distance statistics for the patterns stored in a file.

    ``input_file`` is read as a sequence of records: one line holding a word,
    then one activation line per character of the word (only the last of
    these is kept as the word's pattern), then ``len(word) + 3`` further
    lines that are skipped.  This is the layout produced by
    ``save_hidden_activations``.

    ``output_file`` receives, in this order:
      * every root-mean-square distance between two distinct stored patterns,
      * their mean and standard deviation,
      * a 40-bin histogram of the distances with bin width 0.01; distances
        of 0.4 or more are counted in the last bin.

    Only the first pattern met for a word is kept; a later record of the same
    word with a different pattern is reported on standard output.

    Raises ValueError when a word line is empty or when no pattern is found.
    """
    import sys

    intervals = 40
    step = 0.4 / float(intervals)

    ## sys.stdout.write keeps 'Loading patterns... done.' on a single line;
    ## single-argument print(...) behaves the same in Python 2 and Python 3
    sys.stdout.write('Loading patterns... ')
    first_seen = {}   ## word -> first pattern read for that word
    patterns = []
    with open(input_file, 'r') as f:
        while True:
            l = f.readline()
            if not l:
                break
            word = l[0:-1]
            length = len(word)
            if length == 0:
                raise ValueError('empty word line in ' + str(input_file))
            for _ in range(0, length):   ## keep only the last activations
                line = f.readline()
            pattern = np.fromstring(line, dtype=float, sep=' ')
            known = first_seen.get(word)
            if known is None:
                first_seen[word] = pattern
                patterns.append(pattern)
            elif not np.array_equal(known, pattern):
                print('Different patterns!')
                print(word)
                for old_value, new_value in zip(known, pattern):
                    print('%s \t%s' % (old_value, new_value))
                print('\n\n')
            ## skip the rest of the record (blank line, visible
            ## activations and the two closing blank lines)
            for _ in range(0, length + 3):
                f.readline()
    print('done.')

    if not patterns:
        raise ValueError('no patterns found in ' + str(input_file))
    patterns = np.array(patterns)
    n, features = patterns.shape
    print('Words:  %d' % n)
    print('Features:  %d' % features)
    print('Step: %s' % step)

    stats_between = []
    with open(output_file, 'w') as f_out:
        f_out.write('Distances:\n\n')
        for i in range(0, n):
            p = patterns[i]
            for j in range(i + 1, n):
                q = patterns[j]
                dist = np.sqrt((((p - q)**2).sum()) / features)
                stats_between.append(dist)
                f_out.write(str(dist) + '\n')
        print('...between stats collected.')

        stats_between = np.array(stats_between)
        mean_dist = np.mean(stats_between)
        std_dev = np.std(stats_between)
        f_out.write('\n\nAverage distance: ' + str(mean_dist) + '  std.dev.: ' + str(std_dev) + '\n\n')

        ## integer bin indices; anything beyond the histogram range is
        ## clamped into the last bin instead of indexing out of bounds
        idxs = np.floor(stats_between / float(step)).astype(int)
        idxs = np.clip(idxs, 0, intervals - 1)
        freq_between = np.zeros(intervals)
        for idx in idxs:
            freq_between[idx] += 1
        f_out.write('Between classes:\n')
        for count in freq_between:
            f_out.write(str(count) + '\n')


def compare_representations_and_Levenshtein(training_set, input_file):
    """Relate hidden-representation distances to Levenshtein distances.

    Steps:
      1. For every pair of words in ``training_set`` the Levenshtein
         distance is computed with ``bas.compute_Levenshtein_distance``;
         pairs at distance 1..7 are remembered.
      2. The last hidden activation of every word stored in ``input_file``
         is loaded (record layout: word line, one activation line per
         character, then ``len(word) + 3`` lines that are skipped).
      3. For every pair of loaded words the value returned by
         ``bas.compute_activations_similarity(..., 'L2')`` is collected and
         grouped by the pair's Levenshtein distance.  Smaller values are
         counted as "closer".  Summary statistics and a degree-1
         least-squares fit of that value against the Levenshtein distance
         are printed.

    training_set -- indexable collection of words exposing ``shape[0]``
    input_file   -- path of the patterns file to read

    Nothing is returned; all results are printed.
    """
    max_ld = 7              ## largest Levenshtein distance that is tracked
    threshold = 0.07232     ## values below this count as "closer"
    ld_values = range(1, max_ld + 1)

    ## 1. Levenshtein distance of every pair of training words.
    ##    A pair is stored once, keyed by (earlier word, later word).
    pair_ld = {}
    target = []             ## words with at least one later neighbour at distance 1..7
    n_words = training_set.shape[0]
    for i in range(0, n_words):
        word_i = training_set[i]
        has_neighbour = False
        for j in range(i + 1, n_words):
            dist = bas.compute_Levenshtein_distance(word_i, training_set[j])
            if dist in ld_values:
                pair_ld[(word_i, training_set[j])] = int(dist)
                has_neighbour = True
        if has_neighbour:
            target.append(word_i)
    print(n_words - 1)      ## index of the last word processed
    print('\nTotal target words:\n%d' % len(target))

    ## 2. load internal representations instead of sampling each time
    words_representations = {}
    sampled = []
    with open(input_file, 'r') as f:
        while True:
            l = f.readline()
            if not l:
                break
            word = l[0:-1]
            length = len(word)
            for _ in range(0, length):      ## keep only the last activations
                line = f.readline()
            words_representations[word] = np.fromstring(line, dtype=float, sep=' ')
            sampled.append(word)
            for _ in range(0, length + 3):  ## skip the rest of the record
                f.readline()
    missing = [w for w in target if w not in words_representations]
    print('\nTotal target words not generated:\n%d \n' % len(missing))

    ## 3. compare every pair of loaded representations
    all_values = []
    values_by_ld = [[] for _ in ld_values]   ## index k holds pairs at distance k + 1
    closer = [0] * max_ld
    further = [0] * max_ld
    count_zero = 0
    n_sampled = len(sampled)
    for a in range(0, n_sampled):
        w1 = sampled[a]
        w1_representation = words_representations[w1]
        for b in range(a + 1, n_sampled):
            w2 = sampled[b]
            sim = bas.compute_activations_similarity(w1_representation, words_representations[w2], 'L2')
            all_values.append(sim)
            if sim == 0:
                count_zero += 1
                print('%s   %s' % (w1, w2))
                continue
            ## the pair may have been stored in either order, depending on
            ## the relative position of the two words in the training set
            ld = pair_ld.get((w1, w2))
            if ld is None:
                ld = pair_ld.get((w2, w1))
            if ld is None:
                continue
            values_by_ld[ld - 1].append(sim)
            if sim < threshold:
                closer[ld - 1] += 1
            else:
                further[ld - 1] += 1

    count_closer = sum(closer)
    count_further = sum(further)
    d_ = np.array(all_values)
    ## average over all the (generated) representations
    print('\nAverage distance between all the representations: %.3f %.3f' % (np.mean(d_), np.std(d_)))
    ## averages restricted to pairs at a given Levenshtein distance
    for ld in ld_values:
        group = np.array(values_by_ld[ld - 1])
        prefix = '\n' if ld == 1 else ''
        print('%sAverage distance between representations at LD%d: %.3f %.3f'
              % (prefix, ld, np.mean(group), np.std(group)))
    total = count_closer + count_further
    ratio = float(count_closer) / total if total else float('nan')
    print('\nPercentage of words with representations closer than %s: %.4f' % (threshold, ratio))
    print('Tot distances computed:  %d' % total)
    print(' '.join(str(c) for c in closer))
    print(' '.join(str(c) for c in further))
    print('Zero distances:  %d' % count_zero)

    ## regression of the representation distance on the Levenshtein distance
    sizes = [len(group) for group in values_by_ld]
    sample_size = sum(sizes)
    print('\nsize of the sample:  %d' % sample_size)
    if sample_size < 2:
        print('\nnot enough pairs for a linear fit')
        return
    ## t[k] is the Levenshtein distance of the k-th collected pair and
    ## xn[k] the corresponding representation distance
    t = np.repeat(np.arange(1, max_ld + 1, dtype=float), sizes)
    xn = np.concatenate([np.asarray(group, dtype=float) for group in values_by_ld])
    # linear regression - polyfit
    (ar, br) = np.polyfit(t, xn, 1)
    correlation = np.corrcoef(t, xn)[0, 1]
    print('\na=%.2f\tb=%.2f' % (ar, br))
    print('r:  %s \n' % correlation)


##    # linear regression - linregress
##    slope, intercept, r_value, p_value, std_err = stats.linregress(t, xn)
##    print('\ta=%.2f\tb=%.2' % (slope, intercept))
##    print 'r-square: ', r_value

##    fig = PLT.figure()
##    ax = fig.add_subplot(1,1,1)
##    ax.plot(t, xn, 'g.', t, xr, 'b-', markersize=1)
##    PLT.xlim( (0, 8)   )
##    PLT.ylim( (0, 0.6) )
##    PLT.show()

##    # Linear regression example
##    # including confidence intervals for params
##    from scipy import linspace, polyval, polyfit, sqrt, stats, randn
##    from pylab import figure, plot, title, show , legend, xlabel, ylabel, grid
##    from numpy import arange, linspace, mean, std, zeros, ones
##    from numpy import loadtxt, array
##    import scipy.interpolate
##    ## definition
##    def mylinearregression(x,y,confidence_level):
##        """
##        function to calculate simple linear regression
##        inputs: x,y-data pairs and a confidence level (in percent)
##        outputs: slope = b1, intercept = b0 its confidence intervals
##        and R (+squared)
##        """
##        show_plots = 'on'        # 'on' to show data, fit and conf interval
##        x_average = mean(x)
##        x_stdev = std(x)
##        y_average = mean(y)
##        y_stdev = std(y)
##        n = len(x)
##        # calculate linear regression coefficients
##        b1 = sum((x-x_average)*(y-y_average))/sum((x-x_average)**2) 
##        b0 = y_average - b1 * x_average
##        #sample_correlation = b1*x_stdev/y_stdev
##        # calculate residuals (observed - predicted)
##        TotSS = sum((y-y_average)**2)   # Total Sum of Squares
##        y_hat = b1 * x + b0             # fitted values
##        ResSS = sum((y-y_hat)**2)       # Residual Sum of Squares
##        # calculate standard deviations of fit params
##        b1_stdev = sqrt((ResSS/(n-2))/sum((x-x_average)**2))       
##        b0_stdev = b1_stdev*sqrt(sum(x**2)/n)   
##        # compute the mean square error (variance) and standard error (root of var), R2 and R
##        mserr = ResSS/n-2
##        sterr = sqrt(mserr)
##        R2 = 1 - ResSS/TotSS
##        R = sqrt(R2)
##        # Pearson's r (this is the same as sample_correlation) 
##        #pearsonsr = (sum(x*y)-(sum(x)*sum(y))/n)/sqrt((sum(x**2)-sum(x)**2/n)*(sum(y**2)-sum(y)**2/n))
##        print 'r_square = %.3f ' %(R2)
##        ## calculate confidence interval
##        # alpha
##        alpha = 1.-(confidence_level*1./100.)
##        # degrees of freedom (2 lost by estimates of slope and intercept)
##        DF = n-2
##        # critical value (look up in t table)
##        cv = 1.96
##        # Margin of error = Critical value x Standard error of the statistic
##        moe_b1 = cv * b1_stdev
##        moe_b0 = cv * b0_stdev
##        lower_b1 = b1-moe_b1
##        upper_b1 = b1+moe_b1
##        lower_b0 = b0-moe_b0
##        upper_b0 = b0+moe_b0
##        print ' Report of linear regression:'
##        print ' Slope = %.4f +/- %.4f, Intercept = %.4f +/- %.4f' %(b1, moe_b1, b0, moe_b0)
##        print ' %g percent confidence interval for slope: %.4f to %.4f' %(confidence_level, lower_b1, upper_b1)
##        print ' %g percent confidence interval for intercept: %.4f to %.4f' %(confidence_level, lower_b0, upper_b0)
##        # compute confidence lines for plot
##        lower = upper_b1 * x + lower_b0
##        upper = lower_b1 * x + upper_b0
##        if show_plots == 'on':
##            figure(1000)
##            title('Linear Regression')
##            xlabel('Levenshtein Distance')
##            ylabel('Similarity')
##            grid()
##            z = 4
##            w = 0
##            plot(x,y,'_', label='data', markersize = 2)
##            plot(x,y_hat,'r.-', label='linear fit')
##            plot(x,lower,'c-')
##            plot(x,upper,'c-')
##            plot(z,w,markersize = 0.01)
##            legend(loc='best',numpoints=3)
##            # are the residuals normally distributed?
##            figure(1001)
##            title('Residuals of fit')
##            xlabel('x')
##            ylabel('Residuals')
##            grid()
##            plot(x,y-y_hat,'mo')
##        return b1,b0,lower_b1,upper_b1,lower_b0,upper_b0,R2,R
##    x = t#array(x)
##    y = xn#array(y)
##    print ' Run simple linear regression:'
##    b1,b0,lower_b1,upper_b1,lower_b0,upper_b0,R2,R = mylinearregression(x, y, 95)
##    show()


def save_hidden_activations(t, h, v, samples_num, sequence_length, training_set, gibbs_steps, patterns_save_path):
    """Sample words and store the activations of those found in the training set.

    Samples are drawn in batches of at most 10000 through
    ``t.W.get_samples_and_hidden`` until at least ``samples_num`` have been
    generated.  Each sample is decoded with ``words.decodify_word`` and its
    last character is dropped.  When the result is in ``training_set`` and
    has not been saved before, one record is written to
    ``patterns_save_path``:

        word
        one line of hidden activations per character
        (blank line)
        one line of visible activations per character
        (two blank lines)

    ``h`` and ``v`` are accepted for interface compatibility and are not
    used.  Nothing is returned.
    """
    print('Generating samples...')
    max_size = 10000
    batch = min(samples_num, max_size)
    generated = 0
    saved = set()   ## words already written; a set keeps the lookup cheap
    with open(patterns_save_path, 'w') as f:
        while generated < samples_num:
            V, H = t.W.get_samples_and_hidden(sequence_length, gibbs_steps, batch)
            V = gpu.as_numpy_array(V)
            H = gpu.as_numpy_array(H)
            for i in range(0, batch):
                word = ''.join(words.decodify_word(V[i, :, :]))
                w = word[0:-1]
                if w in saved or w not in training_set:
                    continue
                length = len(w)
                f.write(w + '\n')
                ## hidden patterns, one line per position (prefixes included)
                for p in range(0, length):
                    f.write(''.join(str(value) + ' ' for value in H[i, p, :]) + '\n')
                f.write('\n')
                ## visible activations, same layout
                for p in range(0, length):
                    f.write(''.join(str(value) + ' ' for value in V[i, p, :]) + '\n')
                f.write('\n\n')
                saved.add(w)
            generated += batch
    print('done.')
    print('Total unique words generated:  %d' % len(saved))


def collect_activations(t, h, v, samples_num, sequence_length, training_set, gibbs_steps, patterns_save_path, trie, max_patterns, max_freq):
    """Sample sequences and store visible and hidden activations per position.

    Samples are drawn in batches of at most 20000 through
    ``t.W.get_samples_and_hidden(..., real_and_bin = True)`` until at least
    ``samples_num`` have been generated.  Four files are written, named
    ``patterns_save_path`` plus one of the suffixes:

        _Hr  real-valued hidden activations  (multiplied by 10000, '%d' format)
        _Vr  real-valued visible activations (multiplied by 10000, '%d' format)
        _Hb  binary hidden activations
        _Vb  binary visible activations

    Each record is the decoded word, one activation line per character and a
    blank line.  A sample is saved when one of these holds:
      * ``trie.search(word)`` is true, fewer than ``max_patterns`` such
        samples were saved and the word was saved fewer than ``max_freq`` times;
      * ``trie.search(word)`` is false, fewer than ``5 * max_patterns`` such
        samples were saved and the word was saved fewer than ``max_freq`` times;
      * ``trie.search(word)`` is true and the word was never saved before.

    ``h``, ``v`` and ``training_set`` are accepted for interface
    compatibility and are not used.  Nothing is returned.
    """
    from collections import Counter
    import time

    max_size = 20000
    batch = min(samples_num, max_size)
    generated = 0
    count_found = 0
    count_not_found = 0
    saved_words = set()         ## words found in the trie that were saved at least once
    patt_counter = Counter()    ## word -> number of records saved

    outputs = []
    try:
        for suffix in ('_Hr', '_Vr', '_Hb', '_Vb'):
            outputs.append(open(patterns_save_path + suffix, 'w'))
        f_H_r, f_V_r, f_H_b, f_V_b = outputs
        print('Generating samples...')
        start = time.time()
        while generated < samples_num:
            V_r, H_r, V_b, H_b = t.W.get_samples_and_hidden(sequence_length, gibbs_steps, batch, real_and_bin = True)
            V_r = gpu.as_numpy_array(V_r)
            H_r = gpu.as_numpy_array(H_r)
            V_b = gpu.as_numpy_array(V_b)
            H_b = gpu.as_numpy_array(H_b)
            for i in range(0, batch):
                w = ''.join(words.decodify_word(V_r[i, :, :]))
                length = len(w)

                found = trie.search(w)
                under_freq = patt_counter[w] < max_freq
                keep = ((found and count_found < max_patterns and under_freq)
                        or (not(found) and count_not_found < (max_patterns * 5) and under_freq)
                        or (found and (w not in saved_words)))
                if not keep:
                    continue

                for out in outputs:
                    out.write(w + '\n')

                # real-valued hidden and visible activations
                for p in range(0, length):
                    np.multiply(H_r[i, p, :], 10000).tofile(f_H_r, sep=' ', format='%d')
                    f_H_r.write('\n')
                for p in range(0, length):
                    np.multiply(V_r[i, p, :], 10000).tofile(f_V_r, sep=' ', format='%d')
                    f_V_r.write('\n')

                # binary-valued hidden and visible activations
                for p in range(0, length):
                    f_H_b.write(''.join(str(int(value)) + ' ' for value in H_b[i, p, :]) + '\n')
                for p in range(0, length):
                    f_V_b.write(''.join(str(int(value)) + ' ' for value in V_b[i, p, :]) + '\n')

                for out in outputs:
                    out.write('\n')

                if found:
                    saved_words.add(w)
                    count_found += 1
                else:
                    count_not_found += 1
                patt_counter[w] += 1

            generated += batch
            if generated % (100*batch) == 0:
                print(generated)
        ## whole seconds, measured with the portable time.time()
        timediff = int(time.time()) - int(start)
        print('Generation complete. It took  %d  seconds.' % timediff)
    finally:
        ## close whatever was opened, even when sampling or writing failed
        for out in outputs:
            out.close()
    print('done.')
    print('Total unique words generated:  %d' % len(saved_words))
    print('Total patterns of words:  %d' % count_found)
    print('Total patterns saved:  %d' % sum(patt_counter.values()))


def prepare_classification_data(patterns_save_path, trie, suffix='_Hr'):
    """Turn a patterns file into a labelled dataset for classification.

    Reads ``patterns_save_path + suffix``, whose records are a word line,
    one pattern line per character of the word and one separator line, and
    writes ``patterns_save_path + suffix + '_classify'``.

    For each word ``trie.search_also_prefixes(word)`` gives a position:
      * position equal to the word length: every pattern of the word is a
        positive instance;
      * otherwise the pattern at index ``position`` is a negative training
        instance and all following patterns are negative test instances.

    The output holds three groups (positive, negative training, negative
    test); each group is its size on one line followed by alternating word
    and pattern lines.

    suffix -- selects the patterns file; the default '_Hr' is the file of
              real-valued hidden activations ('_Vr', '_Hb' and '_Vb' follow
              the same layout).
    """
    positive = []       ## (word, pattern) pairs
    negative_tr = []
    negative_te = []
    with open(patterns_save_path + suffix, 'r') as f:
        while True:
            l = f.readline()
            if not l:
                break
            w = l[0:-1]
            length = len(w)
            position = trie.search_also_prefixes(w)
            if length == position:
                ## existing word: insert all positive patterns (one for each position)
                for _ in range(0, length):
                    positive.append((w, f.readline()[0:-1]))
            else:
                ## illegal word: the pattern at the first wrong position is
                ## the negative training instance
                for _ in range(0, position + 1):
                    p = f.readline()
                negative_tr.append((w, p[0:-1]))
                ## the following patterns are negative instances for the test set
                for _ in range(position + 1, length):
                    negative_te.append((w, f.readline()[0:-1]))
            f.readline()    ## separator line closing the record

    with open(patterns_save_path + suffix + '_classify', 'w') as fc:
        for group in (positive, negative_tr, negative_te):
            fc.write(str(len(group)) + '\n')
            for word, pattern in group:
                fc.write(word + '\n' + pattern + '\n')


def _read_pattern_group(f):
    """Read a count line followed by that many (word, pattern) line pairs.

    Returns the patterns as a list of float arrays divided by 10000; the
    word lines are consumed and discarded.
    """
    count = int(f.readline())
    rows = []
    for _ in range(0, count):
        f.readline()    ## word label, not needed for training
        rows.append(np.fromstring(f.readline(), dtype = float, sep = ' ') / 10000)
    return rows


def train_linear_classifier(patterns_save_path, n_train=50000, n_test=50000):
    """Fit a least-squares linear classifier on stored hidden patterns.

    Reads ``patterns_save_path + '_Hr_classify'``, which holds three groups
    (positive, negative training, negative test), each a count line followed
    by alternating word and pattern lines.

    Up to ``n_train`` randomly chosen positive and negative-training patterns
    form the training set.  The remaining ones, together with the
    negative-test patterns, form the pool from which up to ``n_test`` test
    patterns are drawn.  Weights come from ``np.linalg.lstsq`` and
    predictions are the rounded dot products.  Labels are 1 for positive
    patterns and 0 for negative ones.

    Prints the array shapes and both error rates, and returns
    ``(training_error, test_error)`` as fractions of the patterns actually
    used.  ``test_error`` is NaN when no test pattern is available.

    Raises ValueError when the file holds no training pattern.
    """
    print('Real hidden:\n')
    with open(patterns_save_path + '_Hr_classify', 'r') as f:
        pos_rows = _read_pattern_group(f)
        neg_tr_rows = _read_pattern_group(f)
        neg_te_rows = _read_pattern_group(f)

    labels = np.array([1] * len(pos_rows) + [0] * (len(neg_tr_rows) + len(neg_te_rows)))
    print(np.array(pos_rows).shape)
    print(np.array(neg_tr_rows).shape)
    print(np.array(neg_te_rows).shape)
    print(labels.shape)
    print('\n')

    ## rows are joined as lists so that an empty group cannot break the stacking
    Data_tr = np.array(pos_rows + neg_tr_rows)
    n_avail = Data_tr.shape[0]
    if n_avail == 0:
        raise ValueError('no training patterns in ' + str(patterns_save_path) + '_Hr_classify')

    ## sample n_train patterns for training
    idx_tr = np.random.permutation(n_avail)
    chosen = idx_tr[0:n_train]
    held_out = idx_tr[n_train:]
    D_tr = Data_tr[chosen]
    L_tr = labels[chosen]

    ## test pool: patterns left out of training plus the test-only negatives
    Data_te = np.array(list(Data_tr[held_out]) + neg_te_rows)
    Labels = np.concatenate((labels[held_out], labels[n_avail:]))
    if len(neg_te_rows) > 0:
        ## sanity check: the test-only negatives must all be labelled 0
        print(np.max(Labels[len(held_out):]))
    print(Labels.shape)
    print(Data_te.shape)

    ## rcond=-1 is the historical default of lstsq, stated explicitly so
    ## that the result does not depend on the installed numpy version
    W = np.linalg.lstsq(D_tr, L_tr, rcond=-1)[0]     # one-shot weights learning
    P_tr = np.around(np.dot(D_tr, W))                # predictions on training set
    wrong_predictions_tr = np.count_nonzero(L_tr - P_tr)
    error_tr = wrong_predictions_tr / float(len(L_tr))

    if Data_te.shape[0] > 0:
        ## sample n_test patterns for testing
        idx_te = np.random.permutation(Data_te.shape[0])
        D_te = Data_te[idx_te[0:n_test]]
        L_te = Labels[idx_te[0:n_test]]
        P_te = np.around(np.dot(D_te, W))            # predictions on test set
        wrong_predictions_te = np.count_nonzero(L_te - P_te)
        error_te = wrong_predictions_te / float(len(L_te))
    else:
        error_te = float('nan')

    print('Training set errors: %.3f' % error_tr)
    print('Test set errors: %.3f' % error_te)
    return error_tr, error_te

















def compare_distributed_representations(t, h, samples_num, sequence_length, gibbs_steps, context_tree):
    ## compare hidden units representations to analyse their distribution
    labels = {}
    unique_words = []
    unique_activations = []
    j = u = 0
    ## generate all the samples required and select only existing words
    print 'Generating samples...'
    max_size = 10000
    generated = 0
    if samples_num < max_size: batch = samples_num
    else:                      batch = max_size
    while generated < samples_num:
        x, activation_matrix = t.W.get_samples_and_last_hidden(sequence_length, gibbs_steps, batch)
        generated += batch
        for i in range(0, batch):
            w = ''.join(words.decodify_word(x[i, :, :]))
            if context_tree.search(w) == True:
                j += 1
                if w not in unique_words:
                    unique_activations.append((w, activation_matrix[i]))
                    unique_words.append(w)
                    u += 1
    print 'done.'
    x = gpu.as_numpy_array(x)
    activation_matrix = gpu.as_numpy_array(activation_matrix)
    print 'Total matches: ', j
    print 'Total unique matches: ', u
    ## eventually create a square matrix that contains representations comparison
    sorted_words = []
    unique_activations = sorted(unique_activations)
    for k in range(len(unique_activations)):
        sorted_words.append(unique_activations[k][0])
    similarity_matrix = bas.compute_similarity(unique_activations)
    bas.plot_similarity_matrix(similarity_matrix, sorted_words)
    for i in range(len(unique_activations)):
        for j in range(len(unique_activations)):
            if j >= i:
                break
            if (similarity_matrix[i, j] < 0.03):
                print sorted_words[i], '\t', sorted_words[j], '\t',
                print '\t%.3f' % similarity_matrix[i, j]
    print '\nAverage similarity: ', similarity_matrix.mean()


def analyse_representations_dynamic(t, h, v, samples_num, sequence_length, training_set, gibbs_steps):
    ## generate samples and store only desired internal representations, including all the prefixes representations
    print 'Generating samples...'
    max_size = 10000
    generated = 0
    sampled = []
    prefixes_similarities = {}
    if samples_num < max_size:
        batch = samples_num
    else:
        batch = max_size
    while generated < samples_num:
        V, H = t.W.get_samples_and_hidden(sequence_length, gibbs_steps, batch)
        V = gpu.as_numpy_array(V)
        H = gpu.as_numpy_array(H)
        for i in range(0, batch):
            w = ''.join(words.decodify_word(V[i, :, :]))
            w = w[0:-1]
            if (w in training_set) and (w not in sampled):
                length = len(w)
                if length == 5:
                    dist = np.zeros(length)
                    w_representation = H[i, length - 1, :]
                    ## compute similarity between w and all its prefixes:
                    for p in range(0, length):
                        p_representation = H[i, p, :]
                        dist[p] = bas.compute_activations_similarity(w_representation, p_representation, 'L2')
                    prefixes_similarities[w] = dist
                    sampled.append(w)
        generated += batch
    print 'done.'
    ## print all the distances, in order to allow an easy plotting with a spreadsheet
    for w in sampled:
        print '\n', w, ' ',
        for p in prefixes_similarities[w]:
            print '%.5f' % p,


def plot_weights(W):
    from   pylab  import *
    v, h   = W.w[0].w[0][0].shape
    h1, h2 = W.w[1].w[0][0].shape
    print v, h, h1, h2
    ## consider to normalize to improve readability
    fig1 = figure()
    fig1.suptitle('HH matrix', fontsize = 18)
    ax1 = fig1.add_subplot(1,1,1)
    cax1 = ax1.imshow(W.w[1].w[0][0], cmap = matplotlib.cm.jet, interpolation = 'nearest')
    fig2 = figure()
    fig2.suptitle('VH matrix', fontsize = 18)
    ax2 = fig2.add_subplot(1,1,1)
    cax2 = ax2.imshow(W.w[0].w[0][0], cmap = matplotlib.cm.jet, interpolation = 'nearest')
    show()











def activations_histogram(t, h, samples_num, sequence_length, gibbs_steps, pattern):
    letter = pattern
    print 'Generating samples...'
    label_c = letter + ' as context'
    letters_activations = {}
    freq = {}
    matrix_c = np.zeros((10, h)) ## discriminate first 10 positions (will be summed out in this case)
    tot_freq = np.zeros(10)
    for l in alphabet:
        letters_activations[l] = gpu.zeros(h)
        freq[l] = 0
    max_size = 10000
    generated = 0
    if samples_num < max_size: batch = samples_num
    else:                      batch = max_size
    while generated < samples_num:
        x, letters_activations, freq, matrix_c, tot_freq = t.W.get_samples_and_analyse_context(sequence_length, gibbs_steps, batch, dictionary = letters_activations, frequencies = freq, m = matrix_c, tot_freq = tot_freq, pattern = letter)
        generated += batch
    print 'done.'
    ## normalize activations
    for l in letters_activations:
        if freq[l] != 0:
            letters_activations[l] /= freq[l]
    total_global = tot_freq.sum()
    final_values = matrix_c.sum(0)
    final_values /= total_global
    bas.show_letter_activations_histogram(letter, values = final_values)


def activations_matrix_avg(t, h, samples_num, sequence_length, gibbs_steps):
    print 'Generating samples...'
    letters_activations = {}
    freq = {}
    matrix_p = np.zeros((10, h)) ## discriminate positions
    tot_freq = np.zeros(10)
    for l in alphabet:
        letters_activations[l] = gpu.zeros(h)
        freq[l] = 0
    max_size = 10000
    generated = 0
    if samples_num < max_size: batch = samples_num
    else:                      batch = max_size
    while generated < samples_num:
        x, letters_activations, freq, matrix_p, tot_freq = t.W.get_samples_and_analyse_avg(sequence_length, gibbs_steps, batch, dictionary = letters_activations, frequencies = freq, m = matrix_p, tot_freq = tot_freq)
        generated += batch
    print 'done.'
    ## normalize activations
    for letter in letters_activations:
        if freq[letter] != 0:
            letters_activations[letter] /= freq[letter]       
    for i in range(0, 10):
        if tot_freq[i] != 0:
            matrix_p[i] /= tot_freq[i]
    bas.show_complete_activations_matrix(values = matrix_p, label = 'Average activations')
    return matrix_p


def plot_neurons_specific_activations(t, h, samples_num, sequence_length, gibbs_steps, context_tree):
    ### outline how activations evolve in time for specific patterns
    slope_threshold = 0.23
    intercept_threshold = 0.6
    labels = {}
    unique_words = {}
    ## first generate all the samples required
    print 'Generating samples...'
    max_size = 10000
    generated = 0
    if samples_num < max_size: batch = samples_num
    else:                      batch = max_size
    while generated < samples_num:
        x, activation_matrix = t.W.get_samples_and_hidden(sequence_length, gibbs_steps, batch)
        generated += batch
    print 'done.'
    x = gpu.as_numpy_array(x)
    activation_matrix = gpu.as_numpy_array(activation_matrix)
    ## than select only existing words
    j = k = 0
    for i in range(0, samples_num):
        w = ''.join(words.decodify_word(x[i, :, :]))
        if context_tree.search(w) == False:
            activation_matrix = np.delete(activation_matrix, i - k, 0)
            k += 1
        else:
            labels[j] = w
            unique_words[w] = w
            j += 1
        if i % 1000 == 0:
            print i
    print 'Unique words found: ', unique_words
    print 'Total matches: ', j
    print 'Total unique matches: ', len(unique_words)
    ## eventually apply linear regression to analyse activation trends
    response_list = bas.linear_regression(gpu.as_numpy_array(activation_matrix), labels, slope_threshold, intercept_threshold)
    for n in response_list:
        print n, ':', set(response_list[n])


def analyse_position_sensitivity(t, h, samples_num, sequence_length, gibbs_steps, context_tree):
    ### outline how activations evolve in time according to sequences lengths
    slope_threshold = 0.02
    corr_threshold  = 0.2
    labels = {}
    unique_words = {}
    ## first generate all the samples required
    ## and differentiate according to lengths
    activation_matrix_3 = np.zeros((sequence_length, h))
    activation_matrix_4 = np.zeros((sequence_length, h))
    activation_matrix_5 = np.zeros((sequence_length, h))
    activation_matrix_6 = np.zeros((sequence_length, h))
    activation_matrix_7 = np.zeros((sequence_length, h))
    c3 = c4 = c5 = c6 = c7 = 0
    j = k = 0
    print 'Generating samples...'
    max_size = 20000
    generated = 0
    idx_track_4 = []
    idx_track_5 = []
    idx_track_6 = []
    idx_track_7 = []
    if samples_num < max_size: batch = samples_num
    else:                      batch = max_size
    while generated < samples_num:
        x, activation_matrix = t.W.get_samples_and_hidden(sequence_length, gibbs_steps, batch)
        generated += batch
        x = gpu.as_numpy_array(x)
        activation_matrix = gpu.as_numpy_array(activation_matrix)
        for i in range(batch):
            w = ''.join(words.decodify_word(x[i, :, :]))
            if   len(w) == 3:
                 activation_matrix_3 += activation_matrix[i, :, :]
                 c3 += 1
            elif len(w) == 4:
                 activation_matrix_4 += activation_matrix[i, :, :]
                 idx_track_4.append(i)
                 c4 += 1
            elif len(w) == 5:
                 activation_matrix_5 += activation_matrix[i, :, :]
                 idx_track_5.append(i)
                 c5 += 1
            elif len(w) == 6:
                 activation_matrix_6 += activation_matrix[i, :, :]
                 idx_track_6.append(i)
                 c6 += 1
            elif len(w) == 7:
                 activation_matrix_7 += activation_matrix[i, :, :]
                 idx_track_7.append(i)
                 c7 += 1
    print 'done.'
    
    ## average activations for the same lengths
    activation_matrix_3 /= c3
    activation_matrix_4 /= c4
    activation_matrix_5 /= c5
    activation_matrix_6 /= c6

    ## apply linear regression to analyse activation trends
    from scipy import linspace, polyval, polyfit, sqrt, stats, randn
    from pylab import plot, title, show , legend
    from matplotlib import pyplot as PLT
    import scipy.optimize as opt
    def func(p,x):
        w=p[1]*np.exp(p[0]*x)
        print w.size
        return w
    def residuals(p,x,y):
        w=p[1]*np.exp(p[0]*x)
        err=w-y
        err=err**2
        B=sum(err)
        return B
    p0=[0.3,2.]

    #N = np.array([1, 6, 15, 20, 21, 29, 31, 32, 36, 37, 41, 46, 54, 65, 85, 89, 93, 95, 103, 105])
    #N = np.array([20, 31, 95])
    N = np.array([95])
    
##    neur = 21
##    t  = np.zeros(samples_num * 8)
##    xn = np.zeros(samples_num * 8)
##    xn[0:samples_num]               = activation_matrix[:, 0, neur]
##    t[0:samples_num]                = 0
##    xn[samples_num:samples_num*2]   = activation_matrix[:, 1, neur]
##    t[samples_num:samples_num*2]    = 1
##    xn[samples_num*2:samples_num*3] = activation_matrix[:, 2, neur]
##    t[samples_num*2:samples_num*3]  = 2
##    xn[samples_num*3:samples_num*4] = activation_matrix[:, 3, neur]
##    t[samples_num*3:samples_num*4]  = 3
##    xn[samples_num*4:samples_num*5] = activation_matrix[:, 4, neur]
##    t[samples_num*4:samples_num*5]  = 4
##    xn[samples_num*5:samples_num*6] = activation_matrix[:, 5, neur]
##    t[samples_num*5:samples_num*6]  = 5
##    xn[samples_num*6:samples_num*7] = activation_matrix[:, 6, neur]
##    t[samples_num*6:samples_num*7]  = 6
##    xn[samples_num*7:samples_num*8] = activation_matrix[:, 7, neur]
##    t[samples_num*7:samples_num*8]  = 7
##    fig = PLT.figure()
##    sub_a = fig.add_subplot(1,1,1)
##    sub_a.plot(t, xn, 'r.', marker = '_', markersize = 0.4)
##    t_avg = range(0, 8)
##    xn_avg = np.zeros(8)
##    for i in range(0, 8):
##        xn_avg[i] = np.mean(activation_matrix[:, i, neur])
##    sub_a.plot(t_avg, xn_avg, 'r.', marker = 'o', markersize = 5)
##    PLT.show()

    neur = 95
    idx_track_4 = np.array(idx_track_4)
    activation_matrix_l4 = activation_matrix[idx_track_4, :, :]
    t  = np.zeros(c4 * 8)
    xn = np.zeros(c4 * 8)
    xn[0:c4]      = activation_matrix_l4[:, 0, neur]
    t[0:c4]       = 0
    xn[c4:c4*2]   = activation_matrix_l4[:, 1, neur]
    t[c4:c4*2]    = 1
    xn[c4*2:c4*3] = activation_matrix_l4[:, 2, neur]
    t[c4*2:c4*3]  = 2
    xn[c4*3:c4*4] = activation_matrix_l4[:, 3, neur]
    t[c4*3:c4*4]  = 3
    xn[c4*4:c4*5] = activation_matrix_l4[:, 4, neur]
    t[c4*4:c4*5]  = 4
    xn[c4*5:c4*6] = activation_matrix_l4[:, 5, neur]
    t[c4*5:c4*6]  = 5
    xn[c4*6:c4*7] = activation_matrix_l4[:, 6, neur]
    t[c4*6:c4*7]  = 6
    xn[c4*7:c4*8] = activation_matrix_l4[:, 7, neur]
    t[c4*7:c4*8]  = 7
    t_reg  = t[c4:c4*4]
    xn_reg = xn[c4:c4*4]
    plsq = opt.fmin(residuals, p0, args=(t_reg,xn_reg), maxiter=10000, maxfun=10000)
    #plsq = opt.leastsq(residuals, p0, args=(x,y))
    print plsq[0], plsq[1]
    xrange = np.linspace(1, 3, 10000)
    y2=func(plsq,xrange)
    fig = PLT.figure()
    sub_4 = fig.add_subplot(1,1,1)
    sub_4.plot(t, xn, 'r.', marker = '_', markersize = 0.4)
    sub_4.plot(xrange, y2,'b-')
    PLT.show()

    idx_track_5 = np.array(idx_track_5)
    activation_matrix_l5 = activation_matrix[idx_track_5, :, :]
    x_l5 = x[idx_track_5, :, :]
    t  = np.zeros(c5 * 8)
    xn = np.zeros(c5 * 8)
    xn[0:c5]      = activation_matrix_l5[:, 0, neur]
    t[0:c5]       = 0
    xn[c5:c5*2]   = activation_matrix_l5[:, 1, neur]
    t[c5:c5*2]    = 1
    xn[c5*2:c5*3] = activation_matrix_l5[:, 2, neur]
    t[c5*2:c5*3]  = 2
    xn[c5*3:c5*4] = activation_matrix_l5[:, 3, neur]
    t[c5*3:c5*4]  = 3
##    high_last = []
##    low_last = []
##    high_last_final = []
##    low_last_final = []
##    for w in range(0, len(idx_track_5)):
##        word = ''.join(words.decodify_word(x_l5[w, :, :]))
##        if activation_matrix_l5[w, 3, neur] > 0.9:
##            high_last.append(word[0:-1])
##        elif activation_matrix_l5[w, 3, neur] < 0.1:
##            low_last.append(word[0:-1])
##    print '\nHIGH LAST:\n'
##    for w in high_last:
##        print w, '\t',
##        high_last_final.append(w[-1])
##    print '\n\nLOW LAST:\n'
##    for w in low_last:
##        print w, '\t',
##        low_last_final.append(w[-1])
##
##    #high_last_final = set(high_last_final)
##    #low_last_final = set(low_last_final)
##    d = {}
##    for i in set(high_last_final):
##        d[i] = high_last_final.count(i)
##    print '\n', d, '\n'
##    dd = {}
##    for i in set(low_last_final):
##        dd[i] = low_last_final.count(i)
##    print '\n', dd, '\n'
    xn[c5*4:c5*5] = activation_matrix_l5[:, 4, neur]
    t[c5*4:c5*5]  = 4
    xn[c5*5:c5*6] = activation_matrix_l5[:, 5, neur]
    t[c5*5:c5*6]  = 5
    xn[c5*6:c5*7] = activation_matrix_l5[:, 6, neur]
    t[c5*6:c5*7]  = 6
    xn[c5*7:c5*8] = activation_matrix_l5[:, 7, neur]
    t[c5*7:c5*8]  = 7
    t_reg  = t[c5:c5*5]
    xn_reg = xn[c5:c5*5]
    plsq = opt.fmin(residuals, p0, args=(t_reg,xn_reg), maxiter=10000, maxfun=10000)
    #plsq = opt.leastsq(residuals, p0, args=(x,y))
    print plsq[0], plsq[1]
    xrange = np.linspace(1, 4, 10000)
    y2=func(plsq,xrange)
    fig = PLT.figure()
    sub_5 = fig.add_subplot(1,1,1)
    sub_5.plot(t, xn, 'r.', marker = '_', markersize = 0.4)
    sub_5.plot(xrange, y2,'b-')
    PLT.show()

    idx_track_6 = np.array(idx_track_6)
    activation_matrix_l6 = activation_matrix[idx_track_6, :, :]
    x_l6 = x[idx_track_6, :, :]
    t  = np.zeros(c6 * 8)
    xn = np.zeros(c6 * 8)
    xn[0:c6]      = activation_matrix_l6[:, 0, neur]
    t[0:c6]       = 0
    xn[c6:c6*2]   = activation_matrix_l6[:, 1, neur]
    t[c6:c6*2]    = 1
    xn[c6*2:c6*3] = activation_matrix_l6[:, 2, neur]
    t[c6*2:c6*3]  = 2
    xn[c6*3:c6*4] = activation_matrix_l6[:, 3, neur]
    t[c6*3:c6*4]  = 3
    xn[c6*4:c6*5] = activation_matrix_l6[:, 4, neur]
    t[c6*4:c6*5]  = 4
    high_last = []
    low_last = []
    high_last_final = []
    low_last_final = []
    for w in range(0, len(idx_track_6)):
        word = ''.join(words.decodify_word(x_l6[w, :, :]))
        if activation_matrix_l6[w, 4, neur] > 0.9:
            high_last.append(word[0:-1])
        elif activation_matrix_l6[w, 4, neur] < 0.1:
            low_last.append(word[0:-1])
    print '\nHIGH LAST:\n'
    for w in high_last:
        print w, '\t',
        high_last_final.append(w[-1])
    print '\n\nLOW LAST:\n'
    for w in low_last:
        print w, '\t',
        low_last_final.append(w[-1])

    #high_last_final = set(high_last_final)
    #low_last_final = set(low_last_final)
    d = {}
    for i in set(high_last_final):
        d[i] = high_last_final.count(i)
    print '\n', d, '\n'
    dd = {}
    for i in set(low_last_final):
        dd[i] = low_last_final.count(i)
    print '\n', dd, '\n'
    
    xn[c6*5:c6*6] = activation_matrix_l6[:, 5, neur]
    t[c6*5:c6*6]  = 5
    xn[c6*6:c6*7] = activation_matrix_l6[:, 6, neur]
    t[c6*6:c6*7]  = 6
    xn[c6*7:c6*8] = activation_matrix_l6[:, 7, neur]
    t[c6*7:c6*8]  = 7
    t_reg  = t[c6:c6*6]
    xn_reg = xn[c6:c6*6]
    plsq = opt.fmin(residuals, p0, args=(t_reg,xn_reg), maxiter=10000, maxfun=10000)
    #plsq = opt.leastsq(residuals, p0, args=(x,y))
    print plsq[0], plsq[1]
    xrange = np.linspace(1, 5, 10000)
    y2=func(plsq,xrange)
    fig = PLT.figure()
    sub_6 = fig.add_subplot(1,1,1)
    sub_6.plot(t, xn, 'r.', marker = '_', markersize = 0.4)
    sub_6.plot(xrange, y2,'b-')
    PLT.show()

    idx_track_7 = np.array(idx_track_7)
    activation_matrix_l7 = activation_matrix[idx_track_7, :, :]
    t  = np.zeros(c7 * 9)
    xn = np.zeros(c7 * 9)
    xn[0:c7]      = activation_matrix_l7[:, 0, neur]
    t[0:c7]       = 0
    xn[c7:c7*2]   = activation_matrix_l7[:, 1, neur]
    t[c7:c7*2]    = 1
    xn[c7*2:c7*3] = activation_matrix_l7[:, 2, neur]
    t[c7*2:c7*3]  = 2
    xn[c7*3:c7*4] = activation_matrix_l7[:, 3, neur]
    t[c7*3:c7*4]  = 3
    xn[c7*4:c7*5] = activation_matrix_l7[:, 4, neur]
    t[c7*4:c7*5]  = 4
    xn[c7*5:c7*6] = activation_matrix_l7[:, 5, neur]
    t[c7*5:c7*6]  = 5
    xn[c7*6:c7*7] = activation_matrix_l7[:, 6, neur]
    t[c7*6:c7*7]  = 6
    xn[c7*7:c7*8] = activation_matrix_l7[:, 7, neur]
    t[c7*7:c7*8]  = 7
    xn[c7*8:c7*9] = activation_matrix_l7[:, 8, neur]
    t[c7*8:c7*9]  = 8
    t_reg  = t[c7:c7*7]
    xn_reg = xn[c7:c7*7]
    plsq = opt.fmin(residuals, p0, args=(t_reg,xn_reg), maxiter=10000, maxfun=10000)
    #plsq = opt.leastsq(residuals, p0, args=(x,y))
    print plsq[0], plsq[1]
    xrange = np.linspace(1, 6, 10000)
    y2=func(plsq,xrange)
    fig = PLT.figure()
    sub_7 = fig.add_subplot(1,1,1)
    sub_7.plot(t, xn, 'r.', marker = '_', markersize = 6)
    sub_7.plot(xrange, y2,'b-')
    PLT.show()

    
##    t = range(0, 8)
##    for n in N:
##        #if activation_matrix_3[1, n] < 0.9 and activation_matrix_3[2, n] < 0.9 and activation_matrix_3[3, n] < 0.9:
##        t_3 = range(1, 3)
##        xn_3 = activation_matrix_3[1:3, n]
##        xn = activation_matrix_3[0:8, n]
##        (ar, br) = polyfit(t_3, xn_3, 1)
##        #correlation = np.corrcoef(t_3, xn_3)[0,1]
##        if True:#(ar > slope_threshold and correlation > corr_threshold) or (ar < (0-slope_threshold) and correlation > corr_threshold):
##            xr = polyval([ar, br], t)
##            fig = PLT.figure()
##            ax = fig.add_subplot(1,1,1)
##            PLT.title('Length 3\nNeuron: ' + str(n))
##            ax.plot(t, xn, 'r.', linewidth = 0.5, markersize = 10) # t, xr, 'b--',
##            PLT.xlim( (0, 8)   )
##            PLT.ylim( (0, 1) )
##            PLT.show()
##    for n in N:
##        #if activation_matrix_4[1, n] < 0.9 and activation_matrix_4[2, n] < 0.9 and activation_matrix_4[3, n] < 0.9:
##        t_4 = range(1, 4)
##        xn_4 = activation_matrix_4[1:4, n]
##        xn = activation_matrix_4[0:8, n]
##        c = polyfit(t_4, xn_4, 2)
##        #correlation = np.corrcoef(t_4, xn_4)[0,1]
##        if True:#(ar > slope_threshold and correlation > corr_threshold) or (ar < (0-slope_threshold) and correlation > corr_threshold):
##            t_lot = [0.1* i for i in range (80)]
##            xr = polyval(c, t_lot)
##            fig = PLT.figure()
##            ax = fig.add_subplot(1,1,1)
##            PLT.title('Length 4\nNeuron: ' + str(n))
##            ax.plot(t, xn, 'r.', linewidth = 0.5, markersize = 10) # t_lot, xr, 'b--',
##            PLT.xlim( (0, 8)   )
##            PLT.ylim( (0, 1) )
##            PLT.show()
##    for n in N:
##        #if activation_matrix_5[1, n] < 0.9 and activation_matrix_5[2, n] < 0.9 and activation_matrix_5[3, n] < 0.9:
##        t_5 = range(1, 5)
##        xn_5 = activation_matrix_5[1:5, n]
##        xn = activation_matrix_5[0:8, n]
##        (ar, br, cr) = polyfit(t_5, xn_5, 2)
##        #correlation = np.corrcoef(t_5, xn_5)[0,1]
##        if True:#(ar > slope_threshold and correlation > corr_threshold) or (ar < (0-slope_threshold) and correlation > corr_threshold):
##            t_lot = [0.1* i for i in range (80)]
##            xr = polyval([ar, br, cr], t_lot)
##            fig = PLT.figure()
##            ax = fig.add_subplot(1,1,1)
##            PLT.title('Length 5\nNeuron: ' + str(n))
##            ax.plot(t, xn, 'r.', t_lot, xr, 'b--', linewidth = 0.5, markersize = 10)
##            PLT.xlim( (0, 8)   )
##            PLT.ylim( (0, 1) )
##            PLT.show()
##    for n in N:
##        #if activation_matrix_6[1, n] < 0.9 and activation_matrix_6[2, n] < 0.9 and activation_matrix_6[3, n] < 0.9:
##        t_6 = range(1, 6)
##        xn_6 = activation_matrix_6[1:6, n]
##        xn = activation_matrix_6[0:8, n]
##        (ar, br, cr) = polyfit(t_6, xn_6, 2)
##        #correlation = np.corrcoef(t_6, xn_6)[0,1]
##        if True:#(ar > slope_threshold and correlation > corr_threshold) or (ar < (0-slope_threshold) and correlation > corr_threshold):
##            t_lot = [0.1* i for i in range (80)]
##            xr = polyval([ar, br, cr], t_lot)
##            fig = PLT.figure()
##            ax = fig.add_subplot(1,1,1)
##            PLT.title('Length 6\nNeuron: ' + str(n))
##            ax.plot(t, xn, 'r.', t_lot, xr, 'b--', linewidth = 0.5, markersize = 10)
##            PLT.xlim( (0, 8)   )
##            PLT.ylim( (0, 1) )
##            PLT.show()

    #print('\na=%.2f\tb=%.2f' % (ar, br))
    #print 'r-square: ', correlation, '\n'




##### ANALYSIS ON PARTICULAR PATTERNS (e.g. 'th' vs 'wh') OR SPECIFIC RULES

def activations_matrix_for_predictions(t, h, samples_num, sequence_length, gibbs_steps, pattern):
    print 'Generating samples...',
    label_p = pattern + ' as prediction'
    letters_activations = {}
    freq = {}
    matrix_p = np.zeros((10, h)) ## discriminate positions
    tot_freq = np.zeros(10)
    for l in alphabet:
        letters_activations[l] = gpu.zeros(h)
        freq[l] = 0
    max_size = 10000
    generated = 0
    if samples_num < max_size: batch = samples_num
    else:                      batch = max_size
    while generated < samples_num:
        x, letters_activations, freq, matrix_p, tot_freq = t.W.get_samples_and_analyse_prediction(sequence_length, gibbs_steps, batch, dictionary = letters_activations, frequencies = freq, m = matrix_p, tot_freq = tot_freq, pattern = pattern)
        generated += batch
    print 'done.'

    ## normalize activations
    for letter in letters_activations:
        if freq[letter] != 0:
            letters_activations[letter] /= freq[letter]

    ## discover hidden neurons rensponse preferences
    threshold = 0.7
    neuron_list_pred = []
    for n in range(h):
        neuron_list_pred.append([])
    for l in letters_activations:
        neurons_idx = np.nonzero(letters_activations[l] > threshold)
        for i in neurons_idx[0]:
            neuron_list_pred[i].append(l)
            
    for i in range(0, 10):
        if tot_freq[i] != 0:
            matrix_p[i] /= tot_freq[i]

    return matrix_p, label_p, neuron_list_pred

            
def activations_matrix_for_contexts(t, h, samples_num, sequence_length, gibbs_steps, pattern):
    print 'Generating samples...',
    label_c = pattern + ' as context'
    letters_activations = {}
    freq = {}
    matrix_c = np.zeros((10, h)) ## discriminate first 10 positions
    tot_freq = np.zeros(10)
    for l in alphabet:
        letters_activations[l] = gpu.zeros(h)
        freq[l] = 0
    max_size = 10000
    generated = 0
    if samples_num < max_size: batch = samples_num
    else:                      batch = max_size
    while generated < samples_num:
        x, letters_activations, freq, matrix_c, tot_freq = t.W.get_samples_and_analyse_context(sequence_length, gibbs_steps, batch, dictionary = letters_activations, frequencies = freq, m = matrix_c, tot_freq = tot_freq, pattern = pattern)
        generated += batch
    print 'done.'

    ## normalize activations
    for letter in letters_activations:
        if freq[letter] != 0:
            letters_activations[letter] /= freq[letter]

    ## discover hidden neurons response preferences
    threshold = 0.7
    neuron_list_ctx = []
    for n in range(h):
        neuron_list_ctx.append([])
    for l in letters_activations:
        neurons_idx = np.nonzero(letters_activations[l] > threshold)
        for i in neurons_idx[0]:
            neuron_list_ctx[i].append(l)
    
    for i in range(0, 10):
        if tot_freq[i] != 0:
            matrix_c[i] /= tot_freq[i]
    #bas.show_complete_activations_matrix(values = matrix_c)
    return matrix_c, label_c, neuron_list_ctx


def activations_matrix_for_rules(t, h, samples_num, sequence_length, gibbs_steps, pattern):
    print 'Generating samples...',
    label_r = pattern + ' as rule'
    letters_activations = {}
    freq = {}
    matrix_r = np.zeros((10, h)) ## discriminate first 10 positions
    tot_freq = np.zeros(10)
    for l in alphabet:
        letters_activations[l] = gpu.zeros(h)
        freq[l] = 0
    max_size = 10000
    generated = 0
    if samples_num < max_size: batch = samples_num
    else:                      batch = max_size
    while generated < samples_num:
        x, letters_activations, freq, matrix_r, tot_freq, track = t.W.get_samples_and_analyse_rule(sequence_length, gibbs_steps, batch, dictionary = letters_activations, frequencies = freq, m = matrix_r, tot_freq = tot_freq, pattern = pattern)
        generated += batch
    print 'done.'
    for i in range(0, 10):
        if tot_freq[i] != 0:
            matrix_r[i] /= tot_freq[i]
##    print '\nWords involved: '
##    for j in track:
##        w = ''.join(words.decodify_word(x[j]))
##        print w
##    print 'Total words: ', len(track)
    #bas.show_complete_activations_matrix(values = matrix_r)
    return matrix_r, label_r


def activations_matrix_for_bigrams(t, h, samples_num, sequence_length, gibbs_steps, pattern):
    print 'Generating samples...',
    label_b = pattern + ' as bigram'
    letters_activations = {}
    freq = {}
    matrix_b = np.zeros((10, h)) ## discriminate first 10 positions
    tot_freq = np.zeros(10)
    for l in alphabet:
        letters_activations[l] = gpu.zeros(h)
        freq[l] = 0
    max_size = 10000
    generated = 0
    if samples_num < max_size: batch = samples_num
    else:                      batch = max_size
    while generated < samples_num:
        x, letters_activations, freq, matrix_b, tot_freq, track = t.W.get_samples_and_analyse_bigram(sequence_length, gibbs_steps, batch, dictionary = letters_activations, frequencies = freq, m = matrix_b, tot_freq = tot_freq, pattern = pattern)
        generated += batch
    print 'done.'
    for i in range(0, 10):
        if tot_freq[i] != 0:
            matrix_b[i] /= tot_freq[i]
##    print '\nWords involved: '
##    for j in track:
##        w = ''.join(words.decodify_word(x[j]))
##        print w
##    print 'Total words: ', len(track)
    #bas.show_complete_activations_matrix(values = matrix_b)
    return matrix_b, label_b


def plot_activations_matrices(h, matrix_p, label_p, matrix_c, label_c, matrix_r, label_r, matrix_b, label_b):
    """Plot the four activation matrices restricted to the significant neurons.

    A neuron (column) n in range(h) is significant when, in at least one of the
    four matrices, more than one entry of column n is above 0.3. If at least
    one neuron qualifies, each matrix is reduced to the significant columns and
    shown with bas.show_reduced_activations_matrix under its own label, in the
    order prediction, context, rule, bigram. Otherwise nothing is plotted.
    """
    threshold = 0.3
    matrices = (matrix_p, matrix_c, matrix_r, matrix_b)
    idx = []
    for n in range(h):
        ## number of entries above threshold in column n, for each matrix
        hits = [len(np.nonzero(matrix[:, n] > threshold)[0]) for matrix in matrices]
        if max(hits) > 1:
            idx.append(n)
    if not idx:
        return
    ## reduce all the matrices first, then plot them
    reduced = [matrix[:, idx] for matrix in matrices]
    for values, label in zip(reduced, (label_p, label_c, label_r, label_b)):
        bas.show_reduced_activations_matrix(values = values, label = label)


## Not completed:

def check_stability_of_representations(t, h, v, gibbs_steps, context_tree):
    ## analyse stability of attractors (i.e. hidden representations) for training sequences
    ## NB: it works with BINARY visibles, calculating real-valued activations only
    ## in the final phase in order to generate corresponding words
    samples_num = 2000
    positions = 10
    unique_words = []
    ## first generate all the samples required
    print 'Generating samples...'
    V, V_stoch, H, H_stoch_init = t.W.get_samples_and_hiddens(positions, gibbs_steps, samples_num)
    print 'done.'
    V = gpu.as_numpy_array(V)
    V_stoch = gpu.as_numpy_array(V_stoch)
    H = gpu.as_numpy_array(H)
    H_stoch_init = gpu.as_numpy_array(H_stoch_init)
    ## than select only existing words between generated samples
    k = u = 0
    for i in range(samples_num):
        w = ''.join(words.decodify_word(V[i - k, :, :]))
        #if (context_tree.search(w) == False) or (w in unique_words):
        if w != 'set$' or (w in unique_words):
            H = np.delete(H, i - k, 0)
            H_stoch_init = np.delete(H_stoch_init, i - k, 0)
            V = np.delete(V, i - k, 0)
            V_stoch = np.delete(V_stoch, i - k, 0)
            k += 1
        else:
            unique_words.append(w)
            u += 1
    print 'Total unique matches: ', u, '\n'
    for i in range(u):
        w = ''.join(words.decodify_word(V[i, :, :]))
        print w
    print '\n'
    print 'Real-valued activations V:'
    for i in range(v):
        print '%.4f' % V[0, 2, i],
        if (i == 13):
            print ''
    print '\n'
    diversi_h = uguali_h = 0
    diversi_v = uguali_v = 0
    V_r_stoch = V_stoch
    H_r = H
    for k in range(1):
        ## NB: DO NOT TAKE ACTIVATIONS AFTER '$'!
        ## clamp visible units and let the network compute hidden activations
        H_r = t.W.reconstruct_hiddens(V_r_stoch)
        H_r = gpu.as_numpy_array(H_r)
        
##        ## compare hidden reconstructions with original hidden representations
##        hidden_diff = bas.compute_representations_similarity(M = H, M_r = H_r)
##        for k in range(u):
##            for j in range(positions):
##                for i in range(h):
##                    if np.abs(H[k, j, i] - H_r[k, j, i]) > 0.001:
##                        #print '%.3f\t%.3f' % (H[k, j, i], H_r[k, j, i])
##                        diversi_h += 1
##                    else:
##                        uguali_h += 1
##        #print 'H_r: ', float(diversi_h) / uguali_h, '\n'
##        ## plot?

        ## clamp hidden units and let the network compute visible activations
        V_r, V_r_stoch = t.W.reconstruct_visibles(H_r, H_stoch_init, gibbs_steps)
        #print '\nDiff: \n', V_stoch - V_r_stoch, '\n'
        
##        ## compare visible reconstructions with original visible values
##        visible_diff = bas.compute_representations_similarity(M = V_stoch, M_r = V_r_stoch)
##        for k in range(u):
##            for j in range(positions):
##                for i in range(v):
##                    if np.abs(V[k, j, i] - V_r[k, j, i]) > 0.001:
##                        #print '%.3f\t%.3f' % (V[k, j, i], V_r[k, j, i])
##                        diversi_v += 1
##                    else:
##                        uguali_v += 1
##        #print 'V_r: ', float(diversi_v) / uguali_v, '\n'
##        ## plot?
##
##        diversi_h = uguali_h = 0
##        diversi_v = uguali_v = 0
##
##        print '\nReal-valued activations V_r:'
##        for i in range(v):
##            print '%.4f' % V_r[0, 0, i],
##            if (i == 13):
##                print ''
##        print '\n'
        for i in range(u):
            w = ''.join(words.decodify_word(V_r[i, :, :]))
            print w
        print '\n'

