
### Procedures to parse the dataset file and extract the desired number of words
### that will be used as the training and test sets.
### This module also includes functions used to encode sequences of letters
### into the matrix format required by the RTRBM, and to decode them back.

from   pylab  import *
from   random import choice
import numpy  as np
import gnumpy as gpu

SIZE = 27         ## size of the coding for each letter: 'a'-'z' plus the '$' end-of-word marker

## number of words for each length: must be a multiple of minibatch_size
## tot_num_N  = words of length N placed in the training set (sums to 5300)
## test_num_N = words of length N placed in the test set     (sums to 1370)
## Tot         : 7350 (with possible repetitions)
## Used        : 6670 (100%)
## Training set: 5300 ( 80%)
## Test set    : 1370 ( 20%)
## The "over K" notes presumably give the number of distinct words of that
## length available in the dataset -- TODO confirm against the data file.
tot_num_3  = 450  # over 587
test_num_3 = 120
tot_num_4  = 1300 # over 1955
test_num_4 = 325
tot_num_5  = 1800 # over 2443
test_num_5 = 480
tot_num_6  = 1300 # over 1630
test_num_6 = 325
tot_num_7  = 450  # over 617
test_num_7 = 120
tot_num_8  = 0    # over 91 (8-letter words are not extracted by extract_words)


def codify_word(word, code_type = 'symbolic'):
    """Encode a word as a matrix of one-hot letter codes.

    With the 'symbolic' code the result has ``len(word) + 1`` rows and
    ``SIZE`` columns.  Row ``i`` holds a single 1 in the column of the
    ``i``-th letter; the extra last row marks the end of the sequence by
    setting the last column.  A symbol that is not one of ``a``-``z`` or
    ``$`` leaves its row all zeros.

    Raises ``Exception`` for the 'features' code (not implemented yet) and
    for any other, unknown, code type.
    """
    if code_type == 'features':
        raise Exception('Code not implemented yet.')
    if code_type != 'symbolic':
        raise Exception('Unknown code type.')

    ## column j of the code stands for symbols[j]; '$' is the terminator
    symbols = np.array(list('abcdefghijklmnopqrstuvwxyz$'))
    length = len(word)
    code = np.zeros((length + 1, SIZE))
    for row in range(length):
        ## np.where yields no index for an unknown symbol, so nothing is set
        hits = np.where(symbols == word[row])[0]
        code[row, hits] = 1
    code[length, SIZE - 1] = 1   ## end of the sequence
    return code


def codify_letter(letter, code_type = 'symbolic'):
    """Encode a single letter as a one-hot vector of length ``SIZE``.

    With the 'symbolic' code the returned vector is all zeros except for a
    1 at the position of `letter` in the alphabet ``a``-``z`` followed by
    ``$``.  A symbol outside that alphabet gives an all-zero vector.

    Raises ``Exception`` for the 'features' code (not implemented yet) and
    for any other, unknown, code type.
    """
    if code_type == 'features':
        raise Exception('Code not implemented yet.')
    if code_type != 'symbolic':
        raise Exception('Unknown code type.')

    symbols = np.array(list('abcdefghijklmnopqrstuvwxyz$'))
    code = np.zeros(SIZE)
    ## the match is empty for an unknown symbol, leaving the vector at zero
    match = np.where(symbols == letter)
    code[match] = 1
    return code


def decodify_word(seq, code_type = 'symbolic'):
    """Decode a matrix of letter codes into a list of symbols.

    `seq` must be two-dimensional (one row per position).  For each row
    the most active column is taken as the letter.  Decoding stops after
    the first ``$`` terminator, which is included in the result.  A row
    whose strongest activity is not above 0.4 is also decoded as ``$`` and
    ends the word.  If no row ends the word, every row is decoded.

    Returns a list of one-character symbols (not a joined string).

    Raises ``Exception`` for the 'features' code (not implemented yet) and
    for any other, unknown, code type.
    """
    if code_type == 'features':
        raise Exception('Code not implemented yet.')
    if code_type != 'symbolic':
        raise Exception('Unknown code type.')

    symbols = np.array(list('abcdefghijklmnopqrstuvwxyz$'))
    terminator = SIZE - 1
    n_rows, _ = seq.shape
    decoded = []
    for row in range(n_rows):
        winner = np.argmax(seq[row])
        ## minimum level of activity required: weaker rows end the word
        if not (seq[row, winner] > 0.4):
            winner = terminator
        decoded.append(symbols[winner])
        if winner == terminator:
            break
    return decoded


def decodify_letter(seq, code_type = 'symbolic'):
    """Decode a single letter code into its symbol.

    With the 'symbolic' code the position of the largest value in `seq`
    selects the symbol from the alphabet ``a``-``z`` followed by ``$``.
    No minimum activity is required here.

    Raises ``Exception`` for the 'features' code (not implemented yet) and
    for any other, unknown, code type.
    """
    if code_type == 'features':
        raise Exception('Code not implemented yet.')
    if code_type != 'symbolic':
        raise Exception('Unknown code type.')

    symbols = np.array(list('abcdefghijklmnopqrstuvwxyz$'))
    strongest = np.argmax(seq)
    return symbols[strongest]


def _draw_without_replacement(pool, count):
    """Move `count` randomly chosen items out of `pool` into a new list.

    `pool` is modified in place, so consecutive draws from the same pool
    never return the same item twice.  `random.choice` raises IndexError
    if the pool runs out before `count` items have been drawn.
    """
    drawn = []
    for _ in range(count):
        picked = choice(pool)
        drawn.append(picked)
        pool.remove(picked)
    return drawn


def extract_words(filename = '../../data/Spokefre2'):
    """Build the training and test sets of 3- to 7-letter words.

    Each line of `filename` is split on tabs; the first field is the word.
    Words of any other length are ignored.  The number of words taken for
    each length comes from the module-level quotas ``tot_num_N`` (training)
    and ``test_num_N`` (test).

    * 3-letter words: the third field is read as an integer (presumably
      the word's frequency count -- confirm against the data file).  The
      ``tot_num_3 + test_num_3`` words with the highest value are kept, and
      ``test_num_3`` of them are then moved at random into the test set.
      A word listed more than once is kept once, with its highest value.
    * 4- to 7-letter words: the distinct words are sampled at random,
      without replacement, first for the training set and then for the
      test set.

    Returns ``(training_set, test_set, lengths)``: two numpy arrays of
    words ordered by length, and a list with the number of training words
    of each length (3 to 7) as strings.

    Raises IndexError if a length has fewer distinct words than requested,
    and ValueError if the third field of a 3-letter word is not an integer.
    """
    best_count_3 = {}
    vocab = {4: [], 5: [], 6: [], 7: []}
    ## the with-block guarantees the file is closed, also on errors
    with open(filename, 'r') as f:
        for line in f:
            ### separate each of the four elements in a row
            elements = line.split('\t')
            word = elements[0]
            n_letters = len(word)
            ### select only words of pre-defined length
            if n_letters == 3:
                count = int(elements[2])
                best_count_3[word] = max(count, best_count_3.get(word, count))
            elif n_letters in vocab:
                vocab[n_letters].append(word)

    ## 3-letter words: rank by count (ties broken alphabetically) and keep
    ## the top ones.  The explicit start index also covers a quota of zero,
    ## where a plain [-0:] slice would keep everything.
    ranked_3 = sorted((count, word) for word, count in best_count_3.items())
    n_keep_3 = tot_num_3 + test_num_3
    first_kept = max(len(ranked_3) - n_keep_3, 0)
    training_3 = [word for _, word in ranked_3[first_kept:]]
    ## from the kept words, remove a random test set
    test_3 = _draw_without_replacement(training_3, test_num_3)

    training_sets = [training_3]
    test_sets = [test_3]
    quotas = ((4, tot_num_4, test_num_4),
              (5, tot_num_5, test_num_5),
              (6, tot_num_6, test_num_6),
              (7, tot_num_7, test_num_7))
    for n_letters, n_train, n_test in quotas:
        ## both draws share one pool, so training and test words are disjoint
        pool = list(np.unique(vocab[n_letters]))
        training_sets.append(_draw_without_replacement(pool, n_train))
        test_sets.append(_draw_without_replacement(pool, n_test))

    training_set = np.concatenate(training_sets)
    test_set = np.concatenate(test_sets)
    lengths = [str(len(words)) for words in training_sets]

    return training_set, test_set, lengths


def extract_words_test(filename = '../../data/Spokefre2', context_tree = None):
    """Return the distinct words of `filename` that were not used for training.

    Each line is split on tabs and its first field is taken as the word.
    Words of one or two letters are skipped.  A longer word is kept when
    ``context_tree.search(word + '$')`` returns False, i.e. when the tree
    does not contain it.  When `context_tree` is None nothing is excluded.

    `context_tree` is any object with a ``search(word)`` method; presumably
    it is the tree built from the training words -- confirm with the caller.

    Returns a sorted numpy array of the distinct words that were kept.
    """
    v = []
    ## the with-block guarantees the file is closed, also on errors
    with open(filename, 'r') as f:
        for line in f:
            ### separate each of the four elements in a row
            elements = line.split('\t')
            word = elements[0]
            if len(word) <= 2:
                continue
            ### exclude words used for training
            ## `== False` is kept on purpose: a search() result such as None
            ## must not count as "absent", which `not ...` would do.
            if context_tree is None or context_tree.search(word + '$') == False:
                v.append(word)

    return np.unique(v)




## NOTE(review): this call runs at import time, so importing the module reads
## 'Spokefre2' from the current working directory (not the default path of
## extract_words) and fails if that file is missing.  Consider moving it
## under an `if __name__ == '__main__':` guard once no importer relies on
## the module-level names training_set, test_set and l.
training_set, test_set, l = extract_words('Spokefre2')























