import numpy as np
import math

# Module-level configuration and shared state

# Residues for which converter() generates a dataset row.
vocab = ["S", "T", "Y"]

# One-letter residue codes scored in every row, in output column order.
amino_acid = [
    "G", "A", "V", "L", "I", "M", "F", "W", "P", "S",
    "T", "Y", "C", "N", "Q", "D", "E", "K", "R", "H",
]

# Number of feature values per residue in the flat feature list.
feature_size = 69
# Number of residues taken on each side of the centre residue.
context_window = 10

# Accumulator of dataset rows; converter() appends to it and returns it.
X = []

#Converts the given sequence to a dataset using the features provided
def converter(seq: str, feats: str) -> list:
    """Append one dataset row for every residue of ``seq`` that is in ``vocab``.

    Each row is the concatenation of:

    * the feature values for a window of ``2 * context_window + 1``
      residues (``feature_size`` values per residue);
    * one composition score per entry of ``amino_acid``: occurrences of that
      residue in the window divided by (occurrences in ``seq`` + 1);
    * ``len(seq) - i`` and ``i``, where ``i`` is the 0-based position of the
      residue.

    The window is centred on the residue when it fits. Near either end of
    the sequence it is shifted so that it stays inside the data, and the
    composition scores are computed over the same residues as the feature
    slice. When the sequence is shorter than a full window, all available
    values are used, so those rows are shorter than full-window rows.

    Args:
        seq: Residue string.
        feats: Whitespace-separated integers. Assumed to hold
            ``feature_size`` values per residue of ``seq``, in residue
            order; this is not checked here.

    Returns:
        The module-level list ``X``. Rows are appended to it, so results
        accumulate across calls.

    Raises:
        ValueError: If a token in ``feats`` is not an integer literal.
    """
    feat_values = [int(token) for token in feats.split()]
    total = len(feat_values)
    seq_len = len(seq)
    span = (2 * context_window + 1) * feature_size

    # The +1 keeps the denominator non-zero for residues absent from seq.
    seq_counts = {aa: seq.count(aa) + 1 for aa in amino_acid}

    for i, residue in enumerate(seq):
        if residue not in vocab:
            continue

        start = (i - context_window) * feature_size
        end = start + span
        if start < 0:
            # Window would run off the front: shift it to begin at 0.
            start, end = 0, span
        elif end > total:
            # Window would run off the back: shift it to finish on the last
            # value. Clamp at 0 so a short sequence cannot produce a
            # negative (wrap-around) slice start.
            start, end = max(0, total - span), total

        # Score the residues that the feature slice actually covers.
        window = seq[start // feature_size:end // feature_size]
        score = [window.count(aa) / seq_counts[aa] for aa in amino_acid]

        X.append(feat_values[start:end] + score + [seq_len - i, i])

    return X
