import csv
import math
import os
import random
import re
import sys
import time
import urllib.request
from collections import defaultdict
from csv import reader
from urllib.parse import urlparse

import Levenshtein as lev
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import py_stringmatching as sm
import seaborn as sns
from bs4 import BeautifulSoup
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

# Load the pairwise similarity matrix produced by the scraping stage.
results = pd.read_csv("/var/www/html/smatrix.csv")

# Correlate every column against every other one and render the
# (unclustered) correlation matrix as a heatmap.
plt.figure()
correlations = results.corr()
sns.heatmap(
    correlations.round(2),
    cmap='RdBu',
    annot=False,
    annot_kws={"size": 7},
    vmin=-1,
    vmax=1,
)

correlations.to_csv("/var/www/html/cmatrix.csv")
plt.savefig('/var/www/html/images/heatmap1.png')

# Turn correlation into a distance (0 = perfectly correlated, 1 = none)
# and build a complete-linkage hierarchy over the condensed matrix.
dissimilarity = 1 - correlations.abs()
Z = linkage(squareform(dissimilarity), 'complete')

# Cut the dendrogram into a user-chosen number of flat clusters and
# persist the url -> cluster assignment as cluster<N>.csv.
threshold = int(input("Enter number of clusters: "))
labels = fcluster(Z, threshold, criterion='maxclust')

urlList = list(results.columns)
print(len(urlList))

# DataFrame.append() was deprecated and removed in pandas 2.0, and the
# old row-at-a-time build was quadratic; construct the frame in one shot.
out = pd.DataFrame({'url': urlList, 'cluster group': labels})
fileName = "cluster" + str(threshold) + ".csv"
out.to_csv(fileName)

# Keep the indices that sort the labels so the matrix columns can be
# regrouped by cluster below.
labels_order = np.argsort(labels)


# Reorder the similarity matrix's columns so columns belonging to the
# same cluster sit next to each other, then redraw the heatmap.
# (The original rebuilt this with a one-column-at-a-time pd.concat loop,
# which is quadratic; a single column selection is equivalent.)
clustered = results[results.columns[labels_order]]

plt.figure()
correlations = clustered.corr()
sns.heatmap(correlations.round(2), cmap='RdBu', annot=False,
            annot_kws={"size": 7}, vmin=-1, vmax=1)

plt.savefig('/var/www/html/images/heatmap2.png')

# NOTE(review): this prepends the url list as an unnamed first column and
# overwrites the input file; a rerun of this script will then read a
# matrix containing a non-numeric column -- confirm this is intentional.
results.insert(0, '', urlList)
results.to_csv("/var/www/html/smatrix.csv", index = False)

# Configure a headless Chrome suitable for running unattended on a server.
options = Options()
options.add_argument('--no-sandbox')
options.add_argument("--disable-setuid-sandbox")
options.add_argument("--headless")
options.add_argument("--disable-infobars")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-browser-side-navigation")
options.add_argument("--disable-gpu")
# Double-dash prefix for consistency with the other switches (the
# original used a single dash).
options.add_argument("--disable-features=VizDisplayCompositor")
options.add_argument("window-size=1920,20000")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36")
# Selenium 4 removed the positional executable-path argument; the driver
# path must be wrapped in a Service object.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

urlinput = input("Please enter your businessess website (URL): ")

def _is_valid_url(candidate):
    """Return True for an absolute http(s) URL with a host part."""
    parsed = urlparse(candidate)
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)

# The original only checked `"http" in urlinput`, which also accepted
# strings like "nothttpatall"; validate the URL structure instead.
while not _is_valid_url(urlinput):
    urlinput = input("Error. Please enter your businessess website (URL): ")


# Submit the URL to the metadata-analysis form.  The find_element_by_*
# helpers were removed in Selenium 4; use find_element(By...., ...).
driver.get("http://www.exadium.com/tools/metadata/")
elem = driver.find_element(By.NAME, "Urls")
elem.clear()
elem.send_keys(urlinput)

keywordbox = driver.find_element(By.ID, "Form_AnalyseMetaForm_ShowKeywords")
keywordbox.click()
button = driver.find_element(By.NAME, "action_CreateAnalyseMeta")
button.click()
# Crude wait for the results table to render.
# TODO(review): replace with WebDriverWait on the table element.
time.sleep(5)

# Scrape title / description / keywords cells from the results table.
usertitle = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div/table/tbody/tr[2]/td[2]')
userdescription = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div/table/tbody/tr[2]/td[3]')
userkeyword = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div/table/tbody/tr[2]/td[4]')

usertitle2 = usertitle.text
print(usertitle2)
userdescription2 = userdescription.text
print(userdescription2)
userkeyword2 = userkeyword.text
print(userkeyword2)

# Build, for each description snippet, the list of lowercase words not
# already seen in an earlier snippet, then drop any word containing a
# non-alphanumeric character.
info = pd.read_csv('rawdata.csv', usecols = ['snippet'])

seen = set()  # every word emitted so far; set gives O(1) membership
              # (the original scanned a growing list -- O(n^2) overall)
newListOfDesc = []
for desc in info['snippet']:
    # Per-snippet de-dup via a set comprehension (split + lower + set).
    words = {w.lower() for w in desc.split(" ")}
    fresh = [w for w in words if w not in seen]
    seen.update(fresh)
    if fresh:
        # Keep only purely-alphanumeric tokens.  all() on an empty string
        # is True, matching the original's any(not c.isalnum()) test, and
        # the sublist is kept even if filtering empties it -- both match
        # the original delete-by-index pass.
        newListOfDesc.append([w for w in fresh if all(c.isalnum() for c in w)])


print(newListOfDesc)







# Persist the keyword data to the project's MySQL instance.
# NOTE(review): credentials are hard-coded below; move them to environment
# variables or a config file kept out of version control.
import mysql.connector
from mysql.connector import Error

# Flatten the de-duplicated description words into one space-separated
# keyword string.  The original referenced an undefined `stringOfWords`,
# which raised a NameError before any insert could run.
stringOfWords = " ".join(word for sublist in newListOfDesc for word in sublist)
# NOTE(review): hasDesc is declared as an int flag but the original passed
# the leftover loop variable `desc` (a snippet string); assuming it should
# record whether the scraped site had a meta description -- confirm.
hasDesc = 1 if userdescription2 else 0

cnx = None
cursor = None
try:
    cnx = mysql.connector.connect(
        user='admin',
        password='Gag85qim',
        host='seniorproject.ckcdq1rpmhlp.us-east-2.rds.amazonaws.com',
        database='elitefour',
    )
    cursor = cnx.cursor()

    # Drop and recreate the tables from scratch on every run.
    cursor.execute("DROP TABLE IF EXISTS URLInfo")
    cursor.execute("DROP TABLE IF EXISTS Metadata")
    # MySQL caps VARCHAR at 65,535 bytes, so the original
    # varchar(1000000000) was rejected; TEXT holds long keyword strings.
    cursor.execute("""CREATE TABLE Metadata (keyword TEXT, hasDesc int)""")

    add_data2 = ("""INSERT INTO elitefour.Metadata
                    (keyword, hasDesc)
                    VALUES(%s, %s)""")
    cursor.execute(add_data2, (stringOfWords, hasDesc))
    cnx.commit()

except Error as e:
    print ("Error while connecting to MySQL:", e)

finally:
    # Close cursor and connection even when an earlier statement failed
    # (the original leaked both on any error).
    if cursor is not None:
        cursor.close()
    if cnx is not None and cnx.is_connected():
        cnx.close()

