
### Procedures to parse the dataset and create an alternative format file
### which will be used to train a linear dynamical system in order to estimate
### the minimum number of hidden units required by the RTRBM.

from pylab import *
import numpy  as np

## Size of the code used for each letter.
SIZE = 27

## Number of words kept for each word length (3, 4 and 5 letters).
## Each value must be a multiple of the minibatch size.
tot_num_3 = 550
tot_num_4 = 1950
tot_num_5 = 2400

def extract_words(filename, num_3=None, num_4=None, num_5=None):
    """Select the most frequent 3-, 4- and 5-letter words from a word list.

    Every line of `filename` is split on tab characters; the first field is
    taken as the word and the third field as its integer frequency count.
    Lines whose first field is not 3, 4 or 5 characters long are ignored.

    For each word length, duplicate (count, word) pairs are removed, the
    remaining pairs are sorted by ascending count (ties broken by the word)
    and the last `num_k` of them, i.e. the most frequent ones, are kept.

    Args:
        filename: path of the tab-separated word/frequency file.
        num_3: maximum number of 3-letter words to keep; defaults to the
            module-level ``tot_num_3`` when None.
        num_4: same for 4-letter words (defaults to ``tot_num_4``).
        num_5: same for 5-letter words (defaults to ``tot_num_5``).

    Returns:
        A tuple ``(v, n3, n4, n5)`` where ``v`` is a 1-D numpy array of
        strings holding the selected 3-letter words, then the 4-letter
        words, then the 5-letter words, and ``n3``, ``n4``, ``n5`` are the
        number of words actually selected for each length.

    Raises:
        IndexError: a line with a 3-5 character first field has fewer than
            three tab-separated fields.
        ValueError: the third field of such a line is not an integer.
    """
    limits = {
        3: tot_num_3 if num_3 is None else num_3,
        4: tot_num_4 if num_4 is None else num_4,
        5: tot_num_5 if num_5 is None else num_5,
    }
    ### one set of (count, word) pairs per accepted word length; using sets
    ### removes duplicate rows while keeping the counts as integers
    buckets = {3: set(), 4: set(), 5: set()}

    with open(filename, 'r') as f:
        for line in f:
            ### separate each of the four elements in a row
            elements = line.split('\t')
            ### select only words of pre-defined length
            bucket = buckets.get(len(elements[0]))
            if bucket is not None:
                bucket.add((int(elements[2]), elements[0]))

    selected = []
    for length in (3, 4, 5):
        ### sorting is done in pure Python so that counts compare numerically;
        ### numpy.unique would flatten the pairs and compare them as strings
        ranked = sorted(buckets[length])
        ### explicit start index: a plain [-limit:] would return everything
        ### when limit is 0 and is awkward when limit exceeds the list size
        start = max(len(ranked) - limits[length], 0)
        selected.append([word for _, word in ranked[start:]])

    vocab_3, vocab_4, vocab_5 = selected
    ### explicit string dtype keeps the result a string array even when empty
    v = np.array(vocab_3 + vocab_4 + vocab_5, dtype=str)

    return v, len(vocab_3), len(vocab_4), len(vocab_5)


## Write the training file: every word goes on its own line as its letters,
## each followed by '(', then a single '$', then one ')' per letter
## (e.g. "cat" becomes "c(a(t($)))"). Each word is also echoed to stdout.
seq, a, b, c = extract_words('Spokefre2')
## 'with' guarantees the output file is closed even if a write fails
with open('training_set_words', 'w') as f:
    for word in seq:
        opening = ''.join(letter + '(' for letter in word)
        closing = ')' * len(word)
        f.write(opening + '$' + closing + '\n')
        ## single-argument print(...) behaves the same on Python 2 and 3
        print(word)














            
