import numpy as np
from   collections import defaultdict
## numeric limits of the int16 type
ii16 = np.iinfo(np.int16)

## ---- run configuration ----
## files: the pattern dump that is read and the two files that can be written
input_file = 'r_data/patterns/3milioni/patterns_5300_h200.txt'
output_file = 'r_data/classifier/5300/data_5300_all'
output_file_svm = 'r_data/classifier/5300/data_5300_all_SVM'

## switches
svm_format = False    ## when True, output_file_svm is written as well
truncate = False      ## when True, words shorter than n_last_letters are skipped
n_last_letters = 7    ## letters to consider


## Read the pattern file. Each record is laid out as:
##   - one line holding the word
##   - one line per character of the word; only the last of them is parsed
##   - one more line, which is read and discarded
patterns     = []
labels       = []
labels_num   = []
labels_trunc = []
counter = 1   ## 1-based running id of the records that are kept
with open(input_file, 'r') as f:   ## the handle is released even if parsing fails
    while True:
        l = f.readline()
        if not l:
            break
        ## strip only the line terminator, so that a final line lacking a
        ## newline does not lose its last character
        word = l.rstrip('\n')
        length = len(word)

        ## reset per record so a stale line from the previous word can never
        ## be parsed by mistake
        line = ''
        for p in range(length): ## last activations
            line = f.readline()
        f.readline()   ## line that follows the record, not used

        ## an empty word has no lines of its own, so there is nothing to parse
        if length == 0:
            continue
        ## with truncate on, words shorter than n_last_letters are dropped
        if truncate and length < n_last_letters:
            continue
        ## readline() only returns '' at end of file: the record is incomplete
        if not line:
            raise ValueError('unexpected end of file while reading the lines of word %r' % word)

        pattern = np.fromstring(line, dtype=float, sep=' ')

        patterns.append(pattern)
        labels.append(word)
        labels_num.append(counter)
        if truncate:
            ## keep the last n_last_letters characters of the word
            word_trunc = word[-n_last_letters:]
            labels_trunc.append(word_trunc)
        counter += 1

labels     = np.array(labels)
labels_num = np.array(labels_num)
patterns   = np.array(patterns)

## create training file
## The shape is unpacked before any file is opened, so a pattern array that is
## not 2-D fails here without truncating an already existing output file.
r, c = patterns.shape
with open(output_file, 'w') as f:
    ## header line: number of rows and number of features
    f.write(str(r)+'\t'+str(c)+'\n')
    for i in range(r):
        ## row prefix: word, numeric label, word length
        f.write(labels[i]+'\t'+str(int(labels_num[i]))+'\t'+str(int(len(labels[i])))+'\t')
        patterns[i].tofile(f, sep='\t')
        f.write('\n')

## optional second file: "<label> <index>:<value> ..." with 1-based indices
if svm_format:
    with open(output_file_svm, 'w') as f:
        for i in range(r):
            ## every entry keeps its trailing space, so the bytes written are
            ## the same as when each entry was written on its own
            entries = ''.join(str(j+1)+':'+str(float(value))+' '
                              for j, value in enumerate(patterns[i]))
            f.write(str(int(labels_num[i]))+' '+entries+'\n')
















