
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import emoji, string
import itertools
from nltk.tokenize import sent_tokenize, word_tokenize

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding
from sklearn.model_selection import KFold
from keras.layers import Dense, Input, Flatten, Reshape
from keras.layers import Conv1D, MaxPool1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed, concatenate, add, Conv2D, SpatialDropout1D, MaxPooling2D
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, precision_recall_fscore_support
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import MultiLabelBinarizer

import os, codecs
# Configure CUDA device enumeration/visibility before any GPU library
# initialises, then fix NumPy's global RNG seed for repeatable runs.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

np.random.seed(100)

#from keras.backend.tensorflow_backend import set_session
#import tensorflow as tf
#config = tf.ConfigProto()
#config.gpu_options.allow_growth = True
#config.log_device_placement = True
#set_session(tf.Session(config=config))

# Number of word ids kept per document (width of the padded input matrix).
MAX_SENT_LENGTH = 100
# presumably the sentence cap for a hierarchical variant of the model -- TODO confirm
MAX_SENTS = 15
# Vocabulary cap handed to the Keras Tokenizer (word ids >= this are dropped).
MAX_NB_WORDS = 20000
# Dimensionality of the pretrained word vectors.
EMBEDDING_DIM = 300
# Fraction of the training data held out for validation during fitting.
VALIDATION_SPLIT = 0.1
# Fraction of the full dataset held out as the test set.
TEST_SPLIT = 0.2

#####################################################################################
#
# DATA CLEANING
#####################################################################################

# emoticons
def load_dict_smileys():
    """Return a fresh dict mapping emoticon strings to a sentiment word.

    The labels are "smiley", "sad", "playful" and "love". Several keys use
    the non-breaking hyphen found in the Wikipedia emoticon list rather than
    an ASCII '-', so both spellings are kept exactly as listed.
    """
    emoticons_by_label = (
        ("smiley", (
            ":‑)", ":-]", ":-3", ":->", "8-)", ":-}",
            ":)", ":]", ":3", ":>", "8)", ":}",
            ":o)", ":c)", ":^)", "=]", "=)", ":-))",
            ":‑D", "8‑D", "x‑D", "X‑D",
            ":D", "8D", "xD", "XD",
        )),
        ("sad", (
            ":‑(", ":‑c", ":‑<", ":‑[",
            ":(", ":c", ":<", ":[",
            ":-||", ">:[", ":{", ":@", ">:(",
            ":'‑(", ":'(",
        )),
        ("playful", (
            ":‑P", "X‑P", "x‑p", ":‑p", ":‑Þ", ":‑þ", ":‑b",
            ":P", "XP", "xp", ":p", ":Þ", ":þ", ":b",
        )),
        ("love", ("<3",)),
    )
    return {
        emoticon: label
        for label, emoticons in emoticons_by_label
        for emoticon in emoticons
    }

# self defined contractions
def load_dict_contractions():
    """Return a fresh dict mapping lowercase contractions/slang to expansions.

    Keys use the plain ASCII apostrophe, so callers must normalise curly
    quotes before looking words up.
    """
    expansions = (
        ("ain't", "is not"),
        ("amn't", "am not"),
        ("aren't", "are not"),
        ("can't", "cannot"),
        ("'cause", "because"),
        ("couldn't", "could not"),
        ("couldn't've", "could not have"),
        ("could've", "could have"),
        ("daren't", "dare not"),
        ("daresn't", "dare not"),
        ("dasn't", "dare not"),
        ("didn't", "did not"),
        ("doesn't", "does not"),
        ("don't", "do not"),
        ("e'er", "ever"),
        ("em", "them"),
        ("everyone's", "everyone is"),
        ("finna", "fixing to"),
        ("gimme", "give me"),
        ("gonna", "going to"),
        ("gon't", "go not"),
        ("gotta", "got to"),
        ("hadn't", "had not"),
        ("hasn't", "has not"),
        ("haven't", "have not"),
        ("he'd", "he would"),
        ("he'll", "he will"),
        ("he's", "he is"),
        ("he've", "he have"),
        ("how'd", "how would"),
        ("how'll", "how will"),
        ("how're", "how are"),
        ("how's", "how is"),
        ("i'd", "i would"),
        ("i'll", "i will"),
        ("i'm", "i am"),
        ("i'm'a", "i am about to"),
        ("i'm'o", "i am going to"),
        ("isn't", "is not"),
        ("it'd", "it would"),
        ("it'll", "it will"),
        ("it's", "it is"),
        ("i've", "i have"),
        ("kinda", "kind of"),
        ("let's", "let us"),
        ("mayn't", "may not"),
        ("may've", "may have"),
        ("mightn't", "might not"),
        ("might've", "might have"),
        ("mustn't", "must not"),
        ("mustn't've", "must not have"),
        ("must've", "must have"),
        ("needn't", "need not"),
        ("ne'er", "never"),
        ("o'", "of"),
        ("o'er", "over"),
        ("ol'", "old"),
        ("oughtn't", "ought not"),
        ("shalln't", "shall not"),
        ("shan't", "shall not"),
        ("she'd", "she would"),
        ("she'll", "she will"),
        ("she's", "she is"),
        ("shouldn't", "should not"),
        ("shouldn't've", "should not have"),
        ("should've", "should have"),
        ("somebody's", "somebody is"),
        ("someone's", "someone is"),
        ("something's", "something is"),
        ("that'd", "that would"),
        ("that'll", "that will"),
        ("that're", "that are"),
        ("that's", "that is"),
        ("there'd", "there would"),
        ("there'll", "there will"),
        ("there're", "there are"),
        ("there's", "there is"),
        ("these're", "these are"),
        ("they'd", "they would"),
        ("they'll", "they will"),
        ("they're", "they are"),
        ("they've", "they have"),
        ("this's", "this is"),
        ("those're", "those are"),
        ("'tis", "it is"),
        ("'twas", "it was"),
        ("wanna", "want to"),
        ("wasn't", "was not"),
        ("we'd", "we would"),
        ("we'd've", "we would have"),
        ("we'll", "we will"),
        ("we're", "we are"),
        ("weren't", "were not"),
        ("we've", "we have"),
        ("what'd", "what did"),
        ("what'll", "what will"),
        ("what're", "what are"),
        ("what's", "what is"),
        ("what've", "what have"),
        ("when's", "when is"),
        ("where'd", "where did"),
        ("where're", "where are"),
        ("where's", "where is"),
        ("where've", "where have"),
        ("which's", "which is"),
        ("who'd", "who would"),
        ("who'd've", "who would have"),
        ("who'll", "who will"),
        ("who're", "who are"),
        ("who's", "who is"),
        ("who've", "who have"),
        ("why'd", "why did"),
        ("why're", "why are"),
        ("why's", "why is"),
        ("won't", "will not"),
        ("wouldn't", "would not"),
        ("would've", "would have"),
        ("y'all", "you all"),
        ("you'd", "you would"),
        ("you'll", "you will"),
        ("you're", "you are"),
        ("you've", "you have"),
        ("whatcha", "what are you"),
        ("luv", "love"),
        ("sux", "sucks"),
    )
    return dict(expansions)

def reduce_lengthening(text):
    """Shorten every run of 3+ identical characters to exactly two.

    Newlines are never collapsed because '.' does not match them.
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

def clean_str(string):
    """Normalise a raw tweet-like string into lowercase, space-separated tokens.

    Processing order:
      1. drop ``http...`` and ``www....`` URLs,
      2. flatten newlines/tabs, lowercase, and squeeze character runs of 3+
         down to 2,
      3. straighten curly quotes and expand whole-word contractions,
      4. replace whole-word emoticons with their sentiment word,
      5. turn emoji into their textual names,
      6. keep only letters, digits and ``. ? ; !`` and pad those four
         punctuation marks with spaces.

    Args:
        string: the raw text. (The name shadows the imported ``string``
            module inside this function; it is kept so keyword callers
            continue to work.)

    Returns:
        The cleaned, lowercased text with single spaces between tokens.
    """
    string = re.sub(r'http\S+', '', string)
    # The dot is escaped so only real "www." prefixes are treated as URLs;
    # unescaped, elongated words such as "awwwww" were deleted as well.
    string = re.sub(r'www\.\S+', '', string)
    string = string.replace('\n', ' ').replace('\t', ' ')
    string = string.lower()
    string = reduce_lengthening(string)

    # CONTRACTIONS source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
    contractions = load_dict_contractions()
    # The contraction keys use the ASCII apostrophe, so straighten quotes first.
    for curly_quote in ("’", "“", "”", "‘"):
        string = string.replace(curly_quote, "'")
    string = " ".join(contractions.get(word, word) for word in string.split())

    # Emoticons source: https://en.wikipedia.org/wiki/List_of_emoticons
    # The text is already lowercased at this point, so the table is keyed by
    # lowercase too; otherwise entries such as ":D", "XD" or ":P" never match.
    smileys = {
        emoticon.lower(): label
        for emoticon, label in load_dict_smileys().items()
    }
    string = " ".join(smileys.get(word, word) for word in string.split())

    # Emoji become ":name_of_emoji:"; the filter below strips ':' and '_'.
    string = emoji.demojize(string)

    # Emoji names may contain capitals, hence the second lower().
    string = re.sub('[^A-Za-z0-9.?;!]+', ' ', string).lstrip().lower()
    for mark in (";", ".", "?", "!"):
        string = string.replace(mark, " " + mark + " ")

    # split/join collapses the padding above and trims both ends.
    return ' '.join(string.split())

class AttLayer(Layer):
    """Attention pooling that reduces a 3-D input over axis 1.

    For an input ``x`` of shape (batch, steps, features) the layer computes
    ``u_t = tanh(x_t W + b)``, scores each step with ``u_t . u``, normalises
    the exponentiated scores over the steps (masked steps get weight 0) and
    returns the weighted sum of ``x`` with shape (batch, features).

    Args:
        attention_dim: size of the hidden projection used for scoring.
        **kwargs: standard Keras ``Layer`` arguments (``name``, ``trainable``
            ...). Accepting them, together with ``get_config``, lets a saved
            model that contains this layer be rebuilt via ``custom_objects``.
    """

    def __init__(self, attention_dim, **kwargs):
        super(AttLayer, self).__init__(**kwargs)
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        assert len(input_shape) == 3
        # add_weight registers the variables with the layer, replacing the
        # manual assignment to ``trainable_weights``.
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init,
                                 trainable=True)
        self.b = self.add_weight(name='b',
                                 shape=(self.attention_dim,),
                                 initializer=self.init,
                                 trainable=True)
        self.u = self.add_weight(name='u',
                                 shape=(self.attention_dim, 1),
                                 initializer=self.init,
                                 trainable=True)
        super(AttLayer, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # The step axis is summed away, so no mask is passed downstream.
        return None

    def call(self, x, mask=None):
        # x: (batch, steps, features); W: (features, attention_dim);
        # b: (attention_dim,); u: (attention_dim, 1)
        # uit = tanh(xW + b) -> (batch, steps, attention_dim)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        # One unnormalised score per step -> (batch, steps)
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)

        ait = K.exp(ait)

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            ait *= K.cast(mask, K.floatx())
        # Normalise over the steps; epsilon guards against a zero sum.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        """Return the constructor arguments needed to re-create the layer."""
        config = super(AttLayer, self).get_config()
        config['attention_dim'] = self.attention_dim
        return config
        
import cv2 
from skimage.transform import resize

# Load the stored profile-image arrays.
# NOTE: allow_pickle=True unpickles objects from the file -- only load
# files from a trusted source.
prof_feat = np.load('profile_feats.npy', allow_pickle=True)

print(prof_feat.shape)

# Resize every image to 48x48x3, overwriting the entries in place.
# NOTE(review): skimage's resize may return floats scaled to [0, 1] for
# integer-typed images, in which case astype(int) truncates them to 0 --
# confirm the dtype/range of the stored images.
for i, image in enumerate(prof_feat):
    prof_feat[i] = resize(image, (48, 48, 3)).astype(int)

# Stack the resized images into one dense array.
features = np.array(list(prof_feat))
print(features.shape)

nRows, nCols, nDims = features.shape[1:]
input_shape = (nRows, nCols, nDims)
features = features.reshape((features.shape[0],) + input_shape)

# Read the two tweet tables; each one later receives a constant class label.
pos_df = pd.read_csv('Dataset/pos_tweets.csv', sep=',', lineterminator='\n')
neg_df = pd.read_csv('Dataset/neg_tweets.csv', sep=',', lineterminator='\n')

for frame in (pos_df, neg_df):
    print(frame)

# Only rows that carry a user description can be used by this model.
pos_df = pos_df.dropna(subset=['user_description'])
neg_df = neg_df.dropna(subset=['user_description'])

# Class ids: 0 for the "pos" file, 1 for the "neg" file.
pos_df['labels'] = 0
neg_df['labels'] = 1

for frame in (pos_df, neg_df):
    print(frame)

# Stack both classes and renumber rows 0..n-1 (the old index is kept as an
# 'index' column because reset_index is called without drop=True).
df = [pos_df, neg_df]
train = pd.concat(df)
print(train)

train = train.reset_index()

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# ekphrasis pipeline: normalise/annotate social-media tokens, segment
# hashtags, unpack contractions, fix elongated words and map emoticons.
text_processor = TextPreProcessor(
    # token classes replaced by a placeholder tag
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'url', 'date', 'number'],
    # token classes that receive an annotation tag
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,
    # word statistics used for segmentation and for spell correction
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=True,
    # callable taking a string and returning a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # replacement dictionaries applied to the extracted tokens
    dicts=[emoticons]
)

from nltk import tokenize

textonly = []
descronly = []
labels = []

# Pre-process the tweet text and the user description of every row and
# collect the matching class label.
for tweet, description, label in zip(train.text, train.user_description, train.labels):
    textonly.append(" ".join(text_processor.pre_process_doc(tweet)))
    descronly.append(" ".join(text_processor.pre_process_doc(description)))
    labels.append(label)

# This experiment models the user descriptions, so they replace the tweet
# texts as the model input from here on.
# NOTE(review): the processed tweet texts collected above are discarded.
textonly = descronly
    
# Fit the vocabulary on the documents that will be fed to the model.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(textonly)
word_index = tokenizer.word_index

# Encode each document as a row of word ids, zero-padded on the right.
# Words whose id is >= MAX_NB_WORDS are skipped and at most MAX_SENT_LENGTH
# ids are kept; lookups stop as soon as a row is full.
textdata = np.zeros((len(textonly), MAX_SENT_LENGTH), dtype='int32')

for row, sentence in enumerate(textonly):
    word_ids = (word_index[token] for token in text_to_word_sequence(sentence))
    frequent_ids = (word_id for word_id in word_ids if word_id < MAX_NB_WORDS)
    for col, word_id in enumerate(itertools.islice(frequent_ids, MAX_SENT_LENGTH)):
        textdata[row, col] = word_id

print('\nTotal %s unique tokens.' % len(word_index))

# One-hot encode the integer class labels.
labels = to_categorical(np.asarray(labels))
print('\nShape of data tensor:', textdata.shape)
print('Shape of label tensor:', labels.shape)

# Precomputed lexicon features, one row per document.
# NOTE: allow_pickle=True unpickles objects -- only load trusted files.
lex_feats = np.load('Dataset/text_lexicons.np', allow_pickle=True)
print(lex_feats.shape)

from afinn import Afinn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as SIA
from sklearn.feature_extraction.text import TfidfVectorizer

afinn_scorer = Afinn()
vader_scorer = SIA()

# One AFINN score and the VADER polarity scores per user description.
afinn = np.array([[afinn_scorer.score(doc)] for doc in descronly])
vader = np.array([list(vader_scorer.polarity_scores(doc).values()) for doc in descronly])

lex_feats = np.c_[afinn, vader, lex_feats]

zero_indices = []

# Word-level (1-4 gram) and character-level (2-4 gram) TF-IDF features.
# NOTE(review): both vectorizers are fitted on the full corpus, i.e. before
# the train/test split further down.
# NOTE(review): stop_words probably has no effect with analyzer='char' --
# confirm against the scikit-learn documentation.
vect1 = TfidfVectorizer(
    analyzer='word', stop_words='english', ngram_range=(1, 4),
    min_df=2, sublinear_tf=True, max_features=1000,
)
vect2 = TfidfVectorizer(
    analyzer='char', stop_words='english', ngram_range=(2, 4),
    min_df=4, sublinear_tf=True, max_features=2000,
)

bow1 = vect1.fit_transform(descronly).toarray()
bow2 = vect2.fit_transform(descronly).toarray()
bow_features = np.c_[bow1, bow2]

# Final hand-crafted feature matrix: TF-IDF columns first, lexicons after.
lex_feats = np.c_[bow_features, lex_feats]
#lex_feats = np.c_[bow_features, afinn, vader]
print(lex_feats.shape)

from sklearn.model_selection import train_test_split

# Stratified hold-out split applied consistently to all four aligned arrays
# (word ids, one-hot labels, image features, hand-crafted features).
(x_train, x_test,
 y_train, y_test,
 prof_feat_train, prof_feat_test,
 lex_feats_train, lex_feats_test) = train_test_split(
    textdata, labels, features, lex_feats,
    test_size=TEST_SPLIT, random_state=42, stratify=labels,
)

# Width of the hand-crafted feature vector fed to the second model input.
in_shape = lex_feats_train.shape[1]
for split_feats in (lex_feats_train, lex_feats_test):
    print(split_feats.shape)

print("\nLoading GloVe model, this can take some time...\n...")
# Map each token of the pretrained embedding file to its vector.
embeddings_index = {}
# The context manager closes the file even if parsing raises.
#f = open('word2vec_cdotdata.txt', encoding='utf-8')
with open('/sda/soumitra_1821cs05/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line would otherwise raise IndexError on values[0].
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float')
        except ValueError:
            # Some tokens contain spaces, so the "vector" part is not purely
            # numeric; such lines are skipped.
            continue
        embeddings_index[word] = coefs
print("Completed loading pretrained models.")

print('\nTotal %s word vectors.' % len(embeddings_index))

# Build the embedding matrix for the attention network: row i holds the
# pretrained vector of the word with id i. Rows start from uniform random
# values, so words missing from the pretrained index keep a random vector
# (they are NOT zeroed).
vocab_rows = len(word_index) + 1  # +1 because id 0 is reserved for padding
embedding_matrix = np.random.random((vocab_rows, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector


# Display names for class ids 0 and 1 in the classification report.
target_names_dep = ['dep', 'non-dep']

# Frozen pretrained embeddings; id 0 is treated as padding and masked.
embedding_layer = Embedding(
    vocab_rows,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SENT_LENGTH,
    trainable=False,
    mask_zero=True,
)

# Two inputs: the padded word-id sequence and the hand-crafted feature vector.
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
lex_input = Input(shape=(in_shape,), dtype='float')

# Text branch: embeddings -> BiGRU -> attention pooling -> dense projection.
embedded_sequences = embedding_layer(sentence_input)
l_gru = Bidirectional(GRU(128, return_sequences=True))(embedded_sequences)
l_att_sent1 = AttLayer(100)(l_gru)
l_att_sent1 = Dropout(0.25)(l_att_sent1)
l_att_sent1 = Dense(100, activation='relu')(l_att_sent1)
l_att_sent1 = Dropout(0.25)(l_att_sent1)

# Fuse the text representation with the hand-crafted features and classify.
merged = concatenate([l_att_sent1, lex_input], axis=-1)
out_emo = Dense(100, activation='relu')(merged)
out_emo = Dropout(0.25)(out_emo)
#out_emo = Dense(100, activation='relu')(out_emo)
#out_emo = Dropout(0.25)(out_emo)
out_emo = Dense(2, activation='softmax', name='out_emo')(out_emo)

model = Model([sentence_input, lex_input], out_emo)

# Loss and metric are keyed by the name of the output layer.
model.compile(
    optimizer='adam',
    loss={'out_emo': 'categorical_crossentropy'},
    metrics={'out_emo': 'accuracy'},
)

# Keep only the weights of the epoch with the best validation accuracy.
weight_file = 'weights/user_desc+lex.hdf5'
callback = [
    ModelCheckpoint(
        weight_file,
        monitor='val_acc',
        verbose=1,
        save_best_only=True,
        mode='max',
    )
]

# Set the optimiser's learning rate explicitly before training.
K.set_value(model.optimizer.lr, 1e-3)
model.fit(
    [x_train, lex_feats_train],
    y_train,
    batch_size=64,
    epochs=10,
    shuffle=True,
    validation_split=VALIDATION_SPLIT,
    verbose=1,
    callbacks=callback,
)

# Restore the best checkpoint before saving the complete model.
model.load_weights(weight_file)
model.save('models/user_desc+lex.h5')

#Depression  Metrics
# Predicted class = arg-max of the softmax output.
a = model.predict([x_test, lex_feats_test])
pred1_test = np.argmax(a, axis=1)
# y_test is already one-hot, so the true class is simply its arg-max
# (no need to round-trip through to_categorical).
classes_test = np.argmax(y_test, axis=1)
predictions1_test = np.asarray(pred1_test)

# Compute every metric once and reuse it for both the file and the console.
test_accuracy = accuracy_score(classes_test, predictions1_test)
test_prf = precision_recall_fscore_support(classes_test, predictions1_test, average='macro')
test_report = classification_report(classes_test, predictions1_test, target_names=target_names_dep)
cm1 = confusion_matrix(classes_test, predictions1_test)

# The context manager guarantees the results file is flushed and closed.
with open('results/user_desc+lex.txt', 'w') as f:
    print('\nTest Accuracy: {}\n'.format(test_accuracy), file=f)
    print('Test P_R_F score: {}\n'.format(test_prf), file=f)
    # The original format string had no placeholder, so the matrix passed
    # to format() was silently dropped from this line.
    print('Confusion Matrix:\n{}'.format(cm1), file=f)
    print('Classificaton Report of LSTM:\n', test_report, file=f)
    print('Confusion matrix of LSTM:\n', cm1, file=f)

print('\nTest Accuracy: {}\n'.format(test_accuracy))
print('Test P_R_F score: {}\n'.format(test_prf))
print('Classificaton Report of LSTM:\n', test_report)
print('Confusion matrix of LSTM:\n', cm1)
