
### Procedures to parse the dataset file and extract the desired number of words
### that will be used as the training set.
### This module also includes the functions used to encode (codify) sequences
### of letters into the matrix format required by the RTRBM and to decode
### (decodify) them back into letters.

from   pylab  import *
from   random import choice
import numpy  as np
import gnumpy as gpu

SIZE = 27         ## length of each letter's one-hot code: 26 letters a-z plus the '$' end marker

## Number of words used for each word length (the suffix N is the length).
##   tot_num_N  : words of length N left in the training set
##   test_num_N : words of length N held out as test set
## extract_words() keeps tot_num_N + test_num_N words of length N and then
## moves test_num_N of them, picked at random, into the test set.
## Each count must be a multiple of minibatch_size (minibatch_size is not
## defined in the visible part of this file).
## Tot         : 7350 (with possible repetitions)
## Used        : 6670 (100%)
## Training set: 5300 ( 80%)  -> sum of the tot_num_N values
## Test set    : 1370 ( 20%)  -> sum of the test_num_N values
## NOTE(review): the "over K" remarks presumably give the number of words of
## that length available in the dataset -- confirm.
tot_num_3  = 450  # over 587
test_num_3 = 120
tot_num_4  = 1300 # over 1955
test_num_4 = 325
tot_num_5  = 1800 # over 2443
test_num_5 = 480
tot_num_6  = 1300 # over 1630
test_num_6 = 325
tot_num_7  = 450  # over 617
test_num_7 = 120
tot_num_8  = 0    # over 91 (not referenced by the functions in this file)


def codify_word(word, code_type = 'symbolic'):
    """Encode a word as a matrix of orthogonal (one-hot) letter codes.

    Row i of the result holds the code of word[i]; one extra, final row
    holds the code of the end-of-sequence marker '$' (last column).
    A character that is not in a-z / '$' leaves its row all zeros.

    word      -- sequence of characters to encode
    code_type -- only 'symbolic' is implemented; 'features' and any other
                 value raise Exception

    Returns an array of shape (len(word) + 1, SIZE).
    """
    ## reject the unsupported code types up front
    if code_type == 'features':
        raise Exception('Code not implemented yet.')
    if code_type != 'symbolic':
        raise Exception('Unknown code type.')

    alphabet = np.array(list('abcdefghijklmnopqrstuvwxyz$'))
    length = len(word)
    coded = np.zeros((length + 1, SIZE))
    for row, char in enumerate(word):
        ## column(s) of the alphabet matching this character (none if unknown)
        coded[row, np.where(alphabet == char)[0]] = 1
    ## the row after the last letter marks the end of the sequence
    coded[length, SIZE - 1] = 1
    return coded

def codify_word_2(word, max_letters, code_type = 'symbolic'):
    """Encode a word as a fixed-height matrix of one-hot letter codes.

    Same coding as codify_word, but the matrix always has max_letters + 1
    rows: row i holds the code of word[i], row len(word) holds the code of
    the end-of-sequence marker '$' (last column) and any remaining rows
    stay all zeros.  A character that is not in a-z / '$' leaves its row
    all zeros.

    word        -- sequence of characters to encode
    max_letters -- number of letter rows to allocate; indexing fails with
                   IndexError when len(word) exceeds it
    code_type   -- only 'symbolic' is implemented; 'features' and any other
                   value raise Exception

    Returns an array of shape (max_letters + 1, SIZE).
    """
    ## reject the unsupported code types up front
    if code_type == 'features':
        raise Exception('Code not implemented yet.')
    if code_type != 'symbolic':
        raise Exception('Unknown code type.')

    alphabet = np.array(list('abcdefghijklmnopqrstuvwxyz$'))
    length = len(word)
    coded = np.zeros((max_letters + 1, SIZE))
    for row, char in enumerate(word):
        ## column(s) of the alphabet matching this character (none if unknown)
        coded[row, np.where(alphabet == char)[0]] = 1
    ## the row after the last letter marks the end of the sequence
    coded[length, SIZE - 1] = 1
    return coded


def codify_letter(letter, code_type = 'symbolic'):
    """Encode a single letter as an orthogonal (one-hot) vector.

    letter    -- character to encode; a character that is not in a-z / '$'
                 yields an all-zero vector
    code_type -- only 'symbolic' is implemented; 'features' and any other
                 value raise Exception

    Returns a vector of length SIZE.
    """
    ## reject the unsupported code types up front
    if code_type == 'features':
        raise Exception('Code not implemented yet.')
    if code_type != 'symbolic':
        raise Exception('Unknown code type.')

    alphabet = np.array(list('abcdefghijklmnopqrstuvwxyz$'))
    code = np.zeros(SIZE)
    ## switch on the position of the letter in the alphabet
    code[np.where(alphabet == letter)] = 1
    return code


def decodify_word(seq, code_type = 'symbolic'):
    """Decode a matrix of letter activations into a list of letters.

    Each row of seq is decoded to the letter with the highest activation
    (argmax).  Decoding stops right after the first end-of-sequence marker
    '$', which is included in the result.

    seq       -- two-dimensional array, one row per letter position
    code_type -- only 'symbolic' is implemented; 'features' and any other
                 value raise Exception

    Returns the list of decoded letters.
    """
    ## reject the unsupported code types up front
    if code_type == 'features':
        raise Exception('Code not implemented yet.')
    if code_type != 'symbolic':
        raise Exception('Unknown code type.')

    ## NOTE: two earlier variants are disabled: one sampled each letter with
    ## probability proportional to its activation (roulette wheel), the
    ## other required a minimum activation of 0.001 and returned '?'
    ## otherwise.  The deterministic argmax decoding below is the one in use.
    alphabet = np.array(list('abcdefghijklmnopqrstuvwxyz$'))
    n_rows, _ = seq.shape
    letters = []
    for row in range(n_rows):
        best = np.argmax(seq[row])
        letters.append(alphabet[best])
        if best == SIZE - 1:
            ## end marker reached: ignore the remaining rows
            break
    return letters


def decodify_letter(seq, code_type = 'symbolic'):
    """Decode one activation vector into the letter with the highest value.

    seq       -- vector of activations, one entry per symbol of the
                 alphabet a-z plus '$'
    code_type -- only 'symbolic' is implemented; 'features' and any other
                 value raise Exception

    Returns the decoded letter.
    """
    ## reject the unsupported code types up front
    if code_type == 'features':
        raise Exception('Code not implemented yet.')
    if code_type != 'symbolic':
        raise Exception('Unknown code type.')

    alphabet = np.array(list('abcdefghijklmnopqrstuvwxyz$'))
    return alphabet[np.argmax(seq)]


def extract_words(filename = '../../data/Spokefre2', counts = None):
    """Build the training and test sets from the word-frequency file.

    Every row of the file is split on tabs; field 0 is the word and field 2
    its (integer) frequency.  For each configured word length the distinct
    (frequency, word) pairs are sorted by frequency, the most frequent
    train + test words are kept, and test of them are then moved, at
    random, into the test set.  A word listed with two different
    frequencies counts as two entries.

    filename -- path of the tab-separated dataset file
    counts   -- optional dict {word length: (training words, test words)};
                defaults to the module constants tot_num_N / test_num_N for
                N = 3..7

    Returns (training_set, test_set, lengths): two arrays of words, ordered
    by word length, and the list of the training-set sizes per length,
    as strings.

    Raises IndexError when a length has fewer words than test words
    requested (nothing left to sample from).
    """
    if counts is None:
        counts = {3: (tot_num_3, test_num_3),
                  4: (tot_num_4, test_num_4),
                  5: (tot_num_5, test_num_5),
                  6: (tot_num_6, test_num_6),
                  7: (tot_num_7, test_num_7)}

    ## collect the (frequency, word) pairs of every configured length;
    ## 'with' makes sure the file is closed again
    pairs_by_length = dict((n, []) for n in counts)
    with open(filename, 'r') as f:
        for line in f:
            ### separate each of the four elements in a row
            elements = line.split('\t')
            bucket = pairs_by_length.get(len(elements[0]))
            ### select only words of pre-defined length
            if bucket is not None:
                bucket.append((int(elements[2]), elements[0]))

    training_words = []
    test_words = []
    lengths = []
    for n in sorted(counts):
        train_num, test_num = counts[n]
        keep = train_num + test_num
        ## Distinct pairs in ascending order of frequency.  numpy.unique
        ## cannot be used here: it flattens the pairs into a 1-D array, so
        ## the 2-D slicing that used to follow fails.
        ranked = sorted(set(pairs_by_length[n]))
        ## keep the most frequent words ([-0:] would keep all of them)
        pool = [word for _, word in ranked[-keep:]] if keep > 0 else []

        ## from each training set, remove a random test set
        held_out = []
        for _ in range(test_num):
            sampled = choice(pool)
            held_out.append(sampled)
            pool.remove(sampled)

        training_words.extend(pool)
        test_words.extend(held_out)
        lengths.append(str(len(pool)))

    return np.array(training_words), np.array(test_words), lengths


def extract_words_test(filename = '../../data/Spokefre2', context_tree = None):
    """Return the sorted, distinct dataset words not used for training.

    Every row of the file is split on tabs and field 0 is taken as the
    word.  Words of fewer than three characters are skipped.  A word is
    kept only when context_tree.search(word + '$') == False, i.e. when the
    tree does not report the word (with the end marker '$' appended).

    filename     -- path of the tab-separated dataset file
    context_tree -- object with a search(word) method; presumably it holds
                    the training words (confirm against the caller).  With
                    None no word is excluded.

    Returns an array with the sorted unique words.
    """
    candidates = []
    ## 'with' makes sure the file is closed again
    with open(filename, 'r') as f:
        for line in f:
            ### separate each of the four elements in a row
            word = line.split('\t')[0]
            if len(word) <= 2:
                continue
            ### exclude words used for training
            if context_tree is None or context_tree.search(word + '$') == False:
                candidates.append(word)

    return np.unique(candidates)

def _chain_word(bigram_list, successors, n_bigrams):
    """Return a random word spelled by n_bigrams chained bigrams, or None.

    The first bigram is drawn from bigram_list; every following one is
    drawn among the bigrams that start with the last letter of the previous
    bigram and differ from it.  None is returned when the chain cannot be
    continued.
    """
    current = choice(bigram_list)
    letters = [current[0], current[1]]
    for _ in range(n_bigrams - 1):
        candidates = [b for b in successors.get(current[1], ()) if b != current]
        if not candidates:
            return None
        current = choice(candidates)
        letters.append(current[1])
    return ''.join(letters)


def extract_artificial_test(train_file = 'TR.txt', test_file = 'TE.txt',
                            words_per_length = 100, max_failures = 100000):
    """Generate artificial test words from the bigrams of the training set.

    The training words are read from the first line of train_file
    (tab-separated).  The bigrams of all training words of 3, 4 or 5
    letters are collected (with repetitions) and chained at random into
    new words of 3, then 4, then 5 letters.  Words that belong to the
    training set are discarded; the same new word may be produced more
    than once.  The result is written, tab-separated, to test_file.

    train_file       -- file holding the training words
    test_file        -- file the generated words are written to
    words_per_length -- number of words generated for each length
    max_failures     -- consecutive unsuccessful attempts tolerated for one
                        word before giving up

    Raises IndexError when the training set provides no bigram at all and
    RuntimeError when no new word of some length can be built.
    """
    with open(train_file) as f:
        line = f.readline()
    ## drop the line terminator, otherwise it sticks to the last word
    training_set = line.rstrip('\r\n').split('\t')

    bigram_list = []
    for w in training_set:
        if 3 <= len(w) <= 5:
            bigram_list.extend(zip(w, w[1:]))

    ## bigrams grouped by their first letter, to find the possible
    ## continuations of a chain directly
    successors = {}
    for bigram in bigram_list:
        successors.setdefault(bigram[0], []).append(bigram)

    known_words = set(training_set)
    test_set = []
    ## words of 3, 4 and 5 letters are made of 2, 3 and 4 bigrams
    for n_bigrams in (2, 3, 4):
        accepted = 0
        failures = 0
        while accepted < words_per_length:
            word = _chain_word(bigram_list, successors, n_bigrams)
            if word is not None and word not in known_words:
                test_set.append(word)
                accepted += 1
                failures = 0
                continue
            failures += 1
            if failures >= max_failures:
                ## without this guard the loop would never end
                raise RuntimeError('Could not build a new word of %d letters '
                                   'from the training bigrams.' % (n_bigrams + 1))

    with open(test_file, 'w') as f:
        f.write('\t'.join(test_set))
    
        


#training_set, test_set, l = extract_words('Spokefre2')
#extract_artificial_test()



##all_big = []
##f = open('Spokefre2', 'r')
##for line in f:
##    elements = line.split('\t')
##    w = elements[0]
##    all_big.append(w[0]+w[1])
##    for i in range(1, len(w) - 1):
##        all_big.append(w[i]+w[i+1])
##    
##all_big = set(all_big)
###print all_big
##print len(all_big)
##
##count = 0
##s = open('samples.txt', 'r')
##for line in s:
##    sentinel = False
##    elements = line.split('\t')
##    w = elements[0]
##    if len(w) > 1:
##        big1 = w[0]+w[1]
##        if big1 not in all_big:
##            #print big1,
##            sentinel = True
##            #print w,
##            count += int(elements[1])
##            #print int(elements[1]),
##        for i in range(1, len(w) - 2):
##            if not sentinel:
##                big = w[i] + w[i+1]
##                if big not in all_big:
##                    #print big,
##                    sentinel = True
##                    #print w,
##                    count += int(elements[1])
##                    #print int(elements[1]),
##
##print '\n', count
















