import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
import nltk
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import re
import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import statsmodels.api as sm
import statsmodels.formula.api as smf # importing stats model api for multiple regression
Stop words are words that do not themselves carry significant meaning for polarity purposes, so we remove them before analysis.
# Download stop words
nltk.download('stopwords')
#example of stopwords
sw = stopwords.words('english')
sw[:5]
def remove_stopwords(text):
    '''A function for removing stop words from a message.'''
    # Lowercase each word and drop any that appear in the stop word list
    text = [word.lower() for word in str(text).split() if word.lower() not in sw]
    # Join the remaining words back together with a space separator
    return ' '.join(text)
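A quick sanity check on a made-up message (the example sentence is hypothetical, not from the dataset):
# Hypothetical example message, not from the BobChat data
remove_stopwords("Where is the nearest dining hall on campus?")
# expected output (roughly): 'nearest dining hall campus?'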
def remove_emoji(string):
    """A function to remove emojis from a string."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
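A quick check of the emoji remover on a hypothetical string (not from the dataset):
# Hypothetical example string, not from the BobChat data
remove_emoji("thanks for the help 😀🚀")
# expected output (roughly): 'thanks for the help '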
# Reading in Raw Data
data_rough = pd.read_csv('BobChat Conversations - All Data.csv')
data = data_rough.copy()
data['Message'] = data['Message'].str.replace(',', '', regex=False)   # Remove commas
data['Message'] = data['Message'].str.replace('.', '', regex=False)   # Remove periods
data['Message'] = data['Message'].str.replace('"', '', regex=False)   # Remove quotes
data['Message'] = data['Message'].str.replace(';', '', regex=False)   # Remove semicolons
data['Message'] = data['Message'].str.replace(':', '', regex=False)   # Remove colons
data['Message'] = data['Message'].str.replace('?', '', regex=False)   # Remove question marks
data['Message'] = data['Message'].str.replace('&', '', regex=False)   # Remove ampersands
data['Message'] = data['Message'].str.replace('!', '', regex=False)   # Remove exclamation points
data['Message'] = data['Message'].str.replace('-', ' ', regex=False)  # Replace dashes with spaces
data['Message'] = data['Message'].str.replace('_', ' ', regex=False)  # Replace underscores with spaces
sns.distplot(data_rough['Message'].str.len(),
rug=True,
bins=np.arange(0, 70,4),
axlabel="Number of Characters (By Four)", color = "gold"
)
sns.set(rc={'figure.figsize':(16,16)})
sns.set(font_scale = 2)
plt.xlim(0, 70)
plt.xticks(np.arange(0,70,4))
plt.gca().set_ylabel('Proportion per character')
plt.title("Before Cleaning Character Proportion");
# Applying the remove_stopwords function to strip stop words from each message
data['Message'] = data['Message'].apply(remove_stopwords)
# Applying the remove_emoji function to strip emojis from each message
data['Message'] = data['Message'].apply(remove_emoji)
sns.distplot(data['Message'].str.len(),
rug=True,
bins=np.arange(0, 70,4),
axlabel="Number of Characters (By Four Characters)"
)
sns.set(rc={'figure.figsize':(14,14)})
sns.set(font_scale = 2)
plt.xlim(0, 70)
plt.xticks(np.arange(0,70,4))
plt.gca().set_ylabel('Proportion per character')
plt.title("Post Cleaning Character Proportion");
sns.distplot(data_rough['Message'].str.len(),
rug=True,
bins=np.arange(0, 70,4),
axlabel="Number of Characters (By Four)", color = "yellow"
)
sns.distplot(data['Message'].str.len(),
rug=True,
bins=np.arange(0, 70,4),
axlabel="Number of Characters (By Four Characters)"
)
sns.set(rc={'figure.figsize':(10,10)})
sns.set(font_scale = 2)
plt.xlim(0, 70)
plt.legend(labels = ["Pre-cleaning", "Post-cleaning"])
plt.xticks(np.arange(0,70,4))
plt.gca().set_ylabel('Proportion per character')
plt.title("Pre vs Post Cleaning Character Proportion")
data['Timestamp']= pd.to_datetime(data['Timestamp'])
print(f"There are {data.shape[0]} Rows and {data.shape[1]} Columns currently in the Data Set")
# Applying a function to count the number of words in a message post cleaning
data["word_count"] = data['Message'].apply(lambda x: len(str(x).split(" ")))
#['weekday'].replace({0: 'Sun', 1 : 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri', 6: 'Sat'}, inplace = True)
#data['weekeday'] = data[data['Timestamp']].weekday()
data['Timestamp'] = pd.to_datetime(data['Timestamp'])
data['day_of_week'] = data['Timestamp'].dt.day_name()
data['hour'] = round((data["Timestamp"].dt.hour) + (data["Timestamp"].dt.minute)/60 + (data["Timestamp"].dt.second)/60**2)
Counting word frequencies on the split messages, we can find the most common words used in our dataset post cleaning.
# Splitting the string type messages, then counting them
top_words = data['Message'].str.split(expand = True).stack().value_counts()
top_20 = top_words[:20]
ax = top_20.plot(kind ='bar', figsize = (15,10), rot = 60)
ax.set_alpha(.20)
ax.set_title("Most common words in the Bobchat Dataset", fontsize = 20)
ax.set_ylabel("Count")
ax.set_xlabel("Words")
# Create a list to collect the plt.patches data
totals = []
# Find the bar heights and append them to the list
for i in ax.patches:
    totals.append(i.get_height())
# Sum of all bar heights, used to convert counts to percentages
total = sum(totals)
# Set individual bar labels using the total computed above
for i in ax.patches:
    # get_x shifts the label left or right; get_height shifts it up or down
    ax.text(i.get_x()-.06, i.get_height()+.06,
            str(round((i.get_height()/total)*100, 1))+'%', fontsize=15, color='black')
def find_ngrams(input_list, n):
    '''A function for joining consecutive words together into n-grams.'''
    return list(zip(*[input_list[i:] for i in range(n)]))
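A quick check on a hypothetical list of words (not from the dataset):
# Hypothetical example, not from the BobChat data
find_ngrams(['dining', 'hall', 'hours'], 2)
# expected output: [('dining', 'hall'), ('hall', 'hours')]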
from collections import Counter
from itertools import chain
pairs = data['Message'].map(lambda x: find_ngrams(x.split(' '), 2)).to_frame()
pairs
new_pairs = []
for p in pairs["Message"]:
    if p != []:
        new_pairs.append(p)
drop_word = ''
newest_pairs = []
for pair in new_pairs:
    if drop_word in chain(*pair):
        continue
    else:
        newest_pairs.append(pair)
newest_pairs
# Bigram Frequency Counts
bigrams = newest_pairs
bigrams = list(chain(*bigrams))
bigrams = [(x.lower(), y.lower()) for x,y in bigrams]
bigram_counts = Counter(bigrams)
bigram_counts = bigram_counts.most_common()
top20_bigram = bigram_counts[:20]
keys = [b[0] for b in top20_bigram]
values = [b[1] for b in top20_bigram]
x = []
for t in keys:
    x.append(" ".join(t))
x
plt.bar(x, values)
sns.set(rc={'figure.figsize':(18,10)})
sns.set(font_scale = 2)
plt.xticks(range(20),rotation=75)
plt.xlabel('Bigrams', fontsize=30)
plt.ylabel('Count', fontsize=30)
plt.title("Bobchat: Most common appearance of words together", fontsize=30)
plt.show()
series_hour = data['hour'].sort_values()
sns.distplot(series_hour, hist=True, color = "red")
sns.set(rc={'figure.figsize':(15,10)})
sns.set(font_scale = 2)
plt.xlabel('Hour', fontsize=30)
plt.ylabel('Proportion', fontsize=30)
plt.title("Bobchat: Most Active Hour For Conversation", fontsize=30)
plt.show()
#message_hour_group = data[["Message", "hour"]].groupby("hour")
#message_hour_group.str.split(expand = True).stack().value_counts().split(expand = True).stack().value_counts()
data["Date"]= [d.date() for d in data["Timestamp"]]
date_message_count = data[["Message", "Date"]].groupby("Date").count()
date_message_count.reset_index(inplace=True)
date_message_count = date_message_count[(date_message_count["Date"] <= datetime.date(2019,3,30)) & (date_message_count["Date"] >= datetime.date(2018,12,1))]
date_message_count["Date"].min()
sns.lineplot(date_message_count["Date"], date_message_count["Message"] )
#sns.plot(date_message_count["Date"], date_message_count["Message"], kind ="bar")
sns.set(font_scale = 2)
plt.xlabel('Date', fontsize=30)
plt.xticks(rotation=45)
plt.ylabel('Message Count', fontsize=30)
plt.title("Bobchat: Conversation Over Time", fontsize=30)
plt.show()
x = ['Sunday','Monday','Tuesday','Wednesday', 'Thursday', 'Friday','Saturday']
# Reindexing so the counts line up with the weekday labels instead of alphabetical order
series_day = data.groupby(["day_of_week"]).count().reindex(x)
ax = sns.barplot(x, series_day['Message'])
sns.set(rc={'figure.figsize':(15,10)})
sns.set(font_scale = 2)
plt.xlabel('Day of the week', fontsize = 20)
plt.ylabel('Count of Messages', fontsize = 20)
plt.title("Bobchat: Most Active Day of the week For Conversation", fontsize = 30)
# Create a list to collect the plt.patches data
totals = []
# Find the bar heights and append them to the list
for i in ax.patches:
    totals.append(i.get_height())
# Sum of all bar heights, used to convert counts to percentages
total = sum(totals)
# Set individual bar labels using the total computed above
for i in ax.patches:
    # get_x shifts the label left or right; get_height shifts it up or down
    ax.text(i.get_x() + .2, i.get_height() + .1,
            str(round((i.get_height()/total)*100, 1))+'%', fontsize=20, color='black')
plt.show()
We will use the VADER (Valence Aware Dictionary and sEntiment Reasoner) lexicon to analyze the sentiment of the BobChat messages. VADER is a lexicon- and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media. The VADER lexicon gives the sentiment of individual words.
# Getting Vader Data Set
url = 'https://raw.githubusercontent.com/cjhutto/vaderSentiment/master/vaderSentiment/vader_lexicon.txt'
vader = pd.read_csv(url, sep = '\t', header = None )
#Renaming Columns
vader = vader.rename(columns = {1:'polarity', 0 : 'token', 2:'standard dev', 3: 'raw scores'})
#Indexing by Token
vader = vader.set_index('token')
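To make the lexicon idea concrete, here is a minimal sketch scoring a hypothetical message (the example text is made up, not from the dataset): a message's polarity is the sum of the lexicon scores of its words, with words missing from the lexicon contributing 0.
# Minimal sketch of per-word lexicon scoring on a hypothetical message
example_message = "great food terrible wait"  # hypothetical example, not from the BobChat data
example_score = sum(vader['polarity'].get(word, 0) for word in example_message.split())
example_score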
# Creating an Id column
data['id'] = list(range(len(data)))
#Setting Index to Id
data = data.set_index('id')
# Subset of data
# Splitting messages and expanding them so each word gets its own column
message_split = data['Message'].str.split(expand = True)
# Saving the numbered word-position columns for the melt below
numbered_columns = message_split.columns.values
# Resetting the index so id becomes a regular column
message_split.reset_index(inplace = True)
# Creating tidy format by melting the numbered columns of the split-message table
tidy_format = pd.melt(message_split, id_vars=['id'], value_vars=numbered_columns)
# Renaming columns to match the vader table for the join
tidy_format = tidy_format.rename(columns = {'variable' : 'num', 'value' : 'word'})
tidy_format = tidy_format.dropna()
tidy_format = tidy_format.sort_values(['id', 'num'])
# joining vader and Tidy Format
tidy_format_sent_merged = tidy_format.merge(vader, how = 'left', left_on = 'word', right_index = True)
tidy_format_sent_merged = tidy_format_sent_merged.loc[:, ['id','num', 'word', 'polarity']]
tidy_format_sent_merged = tidy_format_sent_merged.fillna(0)
# Grouping by message id and summing to get a total polarity score for each message
grouped_tidy_format_sent_merged = tidy_format_sent_merged.groupby('id').agg(sum)
# Adding a column called polarity to our Data Set
# Merged on Id
# Polarity of the message
data['polarity'] = tidy_format_sent_merged.groupby('id').agg({'polarity':'sum'})
print('Most negative messages:')
for m in data.sort_values('polarity').head()['Message']:
print('\n ', m)
print('Most positive messages:')
for m in data.sort_values('polarity', ascending = False).head()['Message']:
print('\n ', m)
print(f'The average polarity score for our conversations is {round(np.mean(data["polarity"]),4)}')
print(f'The standard deviation of our polarity scores is {round(np.std(data["polarity"]),3)}')
This indicates that on average our conversations are more positive than negative, but we cannot say that with much confidence, because polarity scores vary widely across our sample of the possible messages we would see if we spoke to everyone on campus.
plt.figure(figsize=(10,8),edgecolor='blue')
plt.hist(data["polarity"], bins = 10)
#plot the mean and median of the heights
#median in red
#mean in gold
plt.scatter(np.mean(data['polarity']), 0, color='gold', s=200, edgecolors="black");
plt.title("Polarity Distribution")
plt.xlabel("Polarity")
plt.ylabel("Count")
def bootstrap_mean(original_sample, label, replications):
    """Returns a list of bootstrapped sample means.
    original_sample: table containing the original sample
    label: label of the column containing the variable
    replications: number of bootstrap samples
    """
    just_one_column = original_sample[label]
    means = []
    for i in np.arange(replications):
        # Resample with replacement, keeping the original sample size
        bootstrap_sample = just_one_column.sample(frac=1, replace=True)
        resampled_mean = np.mean(bootstrap_sample)
        means = np.append(means, resampled_mean)
    return means
resampled_means = bootstrap_mean(data, "polarity", 1000)
plt.figure(figsize=(10,8),facecolor='violet',edgecolor='blue')
plt.hist(resampled_means, bins = 10)
#plot the mean and median of the heights
#median in red
#mean in gold
plt.scatter(np.mean(data['polarity']), 0, color='gold', s=200, edgecolors="black");
plt.title("Bootstrapped Polarity (1000 Iterations)")
plt.xlabel("Polarity")
plt.ylabel("Count")
data["Response Group"].unique()
Campus = data[data["Response Group"] == 'Campus']
print('Most positive Campus messages:')
for m in Campus.sort_values('polarity', ascending = False).head()['Message']:
print('\n ', m)
print('Most negative Campus messages:')
for m in Campus.sort_values('polarity').head()['Message']:
print('\n ', m)
Dining = data[data["Response Group"] == 'Dining']
print('Most positive Dining messages:')
for m in Dining.sort_values('polarity', ascending = False).head()['Message']:
print('\n ', m)
print('Most negative Dining messages:')
for m in Dining.sort_values('polarity').head()['Message']:
print('\n ', m)
Libraries = data[data["Response Group"] == 'Libraries']
print('Most positive Libraries messages:')
for m in Libraries.sort_values('polarity', ascending = False).head()['Message']:
print('\n ', m)
print('Most negative Libraries messages:')
for m in Libraries.sort_values('polarity').head()['Message']:
print('\n ', m)
Global_Services = data[data["Response Group"] == 'Global Services']
print('Most positive Global Services messages:')
for m in Global_Services.sort_values('polarity', ascending = False).head()['Message']:
print('\n ', m)
print('Most negative Global Services messages:')
for m in Global_Services.sort_values('polarity').head()['Message']:
print('\n ', m)
message_type_count = data.groupby(["Message Type"]).count()
data.drop(data.loc[data["Message Type"] == "Unsure"].index, inplace = True)
message_type_count = data.groupby(["Message Type"]).count()
x = ["Button", "Freeform"]
y = message_type_count['Message']
sns.barplot(x, y)
sns.set(rc={'figure.figsize':(8,5)})
sns.set(font_scale = 2)
plt.xlabel('Message Type', fontsize = 20)
plt.ylabel('Count of Messages', fontsize=20)
plt.title("Bobchat: Message Type For Conversation", fontsize = 20)
plt.show()
data["Response Accuracy"].unique()
series = data.groupby(["Message Type","Response Accuracy"]).count()
pivot_MessageType_Accuracy_count = pd.pivot_table(
data, index='Message Type', columns="Response Accuracy", values= 'Message', aggfunc= len)
pivot_MessageType_Accuracy_count
pd.crosstab(data['Message Type'],data['Response Accuracy']).plot.bar()
sns.set(rc={'figure.figsize':(15,8)})
sns.set(font_scale = 2)
plt.xlabel('Message Type', fontsize = 20 )
plt.xticks(rotation=0)
plt.ylabel('Proportion of Messages', fontsize= 20)
plt.title("Bobchat: Message Type For Conversation By Accuracy of Response", fontsize = 20)
plt.show()
freeform_data = data[data["Message Type"] == "Freeform"]
none = (freeform_data[freeform_data["Response Accuracy"] == 'None'].count()/ len(freeform_data) ) * 100
partial = (freeform_data[freeform_data["Response Accuracy"] == 'Partial'].count()/ len(freeform_data) ) * 100
full = (freeform_data[freeform_data["Response Accuracy"] == 'Full'].count()/ len(freeform_data) ) * 100
print("Freeform Accuracy Percentage \nNone:",
int(none["Message"]), "\nPartial:",
int(partial["Message"]), "\nFull:",
int(full["Message"]))
new = freeform_data[(freeform_data["Response Accuracy"] == "None") ]
improved = new[["Fixed or Improved","Message"]].groupby("Fixed or Improved").count()
x = ["Did not Improve", "Improved"]
sns.barplot(x, improved["Message"])
sns.set(rc={'figure.figsize':(8,5)})
sns.set(font_scale = 2)
plt.xlabel('Accuracy Improvement', fontsize = 20)
plt.ylabel('Count of Messages', fontsize=20)
plt.title("Bobchat Upgrade Assesment on Previous Innaccurate Messages", fontsize = 20)
Res_Group = data.groupby(["Response Group"]).count()
display = data[["Response Group", "Message"]]
display = display.groupby(["Response Group"]).count()
display.sort_values('Message', ascending = False)[:5]
response_bar = display.sort_values('Message', ascending = False)
response_bar = response_bar.reset_index("Response Group")
clrs = ['red' if (x == "Fallback") else 'blue' for x in response_bar['Response Group'] ]
#sb.barplot(x=idx, y=values, palette=clrs) # color=clrs)
sns.barplot(x = response_bar["Response Group"], y = response_bar["Message"], palette = clrs)
sns.set(rc={'figure.figsize':(15,10)})
sns.set(font_scale = 2)
plt.xlabel('Response Group', fontsize = 20 )
plt.xticks(rotation=90)
plt.ylabel('Count of Messages', fontsize= 20)
plt.title("Bobchat: Response Group By Message Count", fontsize = 20)
plt.show()
#response_bar.head()
The fallback is a response type used when the chatbot does not understand the question or cannot find a response to fit it. It returns a standardized message asking the user whether they would like to speak to a human.
# Subset the data to only messages of fallback
fallback = data[data["Response Group"] == "Fallback"]
print( f' There are {fallback.shape[0]} messages that received a Fallback Response')
# Splitting the string type messages, then counting them
Fallback_top_words = fallback['Message'].str.split(expand = True).stack().value_counts()
Fallback_top_words_20 = Fallback_top_words[:20]
ax = Fallback_top_words_20.plot(kind ='bar', figsize = (15,10), rot = 60)
ax.set_alpha(.20)
ax.set_title("Most common words in messages that received a Fallback Response", fontsize = 20)
ax.set_ylabel("Count")
ax.set_xlabel("Words")
# Create a list to collect the plt.patches data
totals = []
# Find the bar heights and append them to the list
for i in ax.patches:
    totals.append(i.get_height())
# Sum of all bar heights, used to convert counts to percentages
total = sum(totals)
# Set individual bar labels using the total computed above
for i in ax.patches:
    # get_x shifts the label left or right; get_height shifts it up or down
    ax.text(i.get_x()-.06, i.get_height()+.06,
            str(round((i.get_height()/total)*100, 1))+'%', fontsize=15, color='black')
print('Most negative Fallback messages:')
for m in fallback.sort_values('polarity').head()['Message']:
print('\n ', m)
print('Most positive Fallback messages:')
for m in fallback.sort_values('polarity', ascending = False)['Message'][1:6]:
print('\n ', m)
# Data Entered Negative Sentiment
display2 = data[["Response Group", "Message", "Negative Sentiment"]]
display2 = display2.groupby(["Response Group"]).agg(sum)
display2.sort_values('Negative Sentiment', ascending = False)[2:7]
# Vader Sentiment
# Keeping only the numeric polarity column so the group means are meaningful
polarity_sort = data[["Response Group", "polarity"]]
polarity_sort = polarity_sort.groupby(["Response Group"]).agg("mean")
polarity_sort.sort_values('polarity')[0:5]
polarity_sort.sort_values('polarity', ascending = False)[0:5]
Res_Type = data[["Response Type", "Message", "Negative Sentiment", "polarity"]]
Res_Type_group = Res_Type[["Response Type", "Message"]].groupby(["Response Type"]).count()
Res_Type_group.sort_values('Message', ascending = False)[0:5]
Res_Type_group = Res_Type[["Response Type", "Negative Sentiment"]].groupby(["Response Type"]).sum()
Res_Type_group.sort_values('Negative Sentiment', ascending = False)[0:5]
Res_Type_pol = Res_Type.groupby(["Response Type"]).agg("mean")
Res_Type_pol.sort_values('polarity')[0:5]
Res_Type_pol.sort_values('polarity', ascending = False)[0:5]
MessageType_ResponseType = pd.pivot_table(data, index='Response Type', columns='Message Type',
values='Message', aggfunc=len)
MessageType_ResponseType = MessageType_ResponseType.fillna(0)
#MessageType_ResponseType.plot(kind= 'bar', figsize = (10,8))
#plt.title("Registered Names vs Year Stratified by Gender")
#plt.gca().set_ylabel('Names Registered that Year');
data.corr()
With the data as it currently stands there do not appear to be any strong correlations, but we can adjust the data types.
We need to change some of the values in the data set so that they are quantitative. pd.get_dummies creates a 1-or-0 indicator column for each unique category in a column (see the small example below).
Below I have chosen Response Type, Response Group, Message Type, Negative Sentiment, word count, hour, and day of the week to predict polarity.
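For instance, on a hypothetical toy frame (not from the BobChat data), pd.get_dummies produces one 0/1 indicator column per category:
# Hypothetical toy example of pd.get_dummies, not from the BobChat data
toy = pd.DataFrame({"Message Type": ["Button", "Freeform", "Button"]})
pd.get_dummies(toy)
# produces the indicator columns 'Message Type_Button' and 'Message Type_Freeform'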
one_hot = pd.get_dummies(data[["Response Type",
"Response Group",
"Message Type",
"Negative Sentiment",
"word_count",
"polarity",
"hour",
"day_of_week"]])
one_hot.dropna(inplace = True)
correlation = one_hot.corr()
large_corr = correlation[abs(correlation["polarity"]) > .2]
Nothing seems to be highly correlated with polarity. We will move on nonetheless.
# Create training and test sets
# Dropping the desired outcome, polarity
X = one_hot.drop(columns=['polarity'])
Y = one_hot['polarity']
# Creating sets: Train 90 , Test 10
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.10)
# Create model off of training set
model = LinearRegression().fit(X_train, Y_train)
# Generate predictions on the test set
Y_pred = model.predict(X_test)
# find residuals
residuals = Y_pred - Y_test
plt.scatter(Y_test, residuals, s = 40)
sns.set(rc={'figure.figsize':(8,5)})
sns.set(font_scale = 2)
plt.axhline(0, color = 'g')
plt.title("Observed Ys and Residuals", fontsize = 20)
plt.ylabel("Residuals", fontsize = 20)
plt.xlabel("Observed Ys", fontsize = 20)
Turns out we can't predict polarity that well without the message or response.
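One direction worth exploring is to featurize the message text itself. Below is a rough sketch (not run above; the variable names are illustrative) that builds TF-IDF features from the cleaned messages using the TfidfVectorizer imported earlier and regresses polarity on them.
# Rough sketch: TF-IDF features from the message text as predictors of polarity
text_df = data[['Message', 'polarity']].dropna()
tfidf = TfidfVectorizer()
X_text = tfidf.fit_transform(text_df['Message'])
y_text = text_df['polarity']
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_text, y_text, test_size=0.10)
text_model = LinearRegression().fit(X_train_t, y_train_t)
print(np.mean((text_model.predict(X_test_t) - y_test_t) ** 2))  # test MSE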
def mse(actual_y, predicted_y):
    """Mean squared error between actual and predicted values."""
    return np.mean((predicted_y - actual_y)**2)
def mae(actual_y, predicted_y):
    """Mean absolute error between actual and predicted values."""
    return np.mean(abs(predicted_y - actual_y))
train_error = mse(Y_train, model.predict(X_train))
test_error = mse(Y_test,model.predict(X_test))
print(train_error, test_error)
train_error = mae(Y_train, model.predict(X_train))
test_error = mae(Y_test,model.predict(X_test))
print(train_error, test_error)
Negative Sentiment is a manual entry by the department, not something observed from the data itself. Let's remove it and see how we do.
one_hot2 = pd.get_dummies(data[["Response Type",
"Response Group",
"Message Type",
"word_count",
"polarity",
"hour",
"day_of_week"]])
one_hot2.dropna(inplace = True)
# Create training and test sets
# Dropping the desired outcome, polarity
X2 = one_hot2.drop(columns=['polarity'])
Y2 = one_hot2['polarity']
# Creating sets: Train 90 , Test 10
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, test_size = 0.10)
# Create model off of training set
model = LinearRegression().fit(X_train2, Y_train2)
# Generate predictions on the test set
Y_pred2 = model.predict(X_test2)
# find residuals
residuals2 = Y_pred2 - Y_test2
plt.scatter(Y_test2, residuals2, s = 40)
plt.axhline(0, color = 'g')
plt.title("Observed Ys and Residuals", fontsize = 20)
plt.ylabel("Residuals", fontsize = 20)
plt.xlabel("Observed Ys", fontsize = 20)
train_error2 = mse(Y_train2, model.predict(X_train2))
test_error2 = mse(Y_test2,model.predict(X_test2))
print(train_error2, test_error2)
Removing negative sentiment increased the training error but reduced the test error.
sns.scatterplot(x = 'polarity', y= 'word_count', data = data, y_jitter = .2, x_jitter = .2)
sns.set(rc={'figure.figsize':(8,6)})
plt.title("Word Count and Polarity", fontsize = 20)
sns.set(font_scale = 2)
# Conducting a linear regression with word_count as the IV
results = smf.ols('polarity ~ word_count', data=data).fit()
results.summary() #Showing the results
Coefficient: if word_count increases by 1, the predicted polarity score changes by about .05.
Confidence interval: if we repeated this study of the association between word_count and polarity many times, about 95% of the resulting confidence intervals would contain the true slope.
P-value: the probability of seeing an association at least this strong if the null hypothesis (no relationship between word_count and polarity) were true is about 0.001. Together with the confidence interval, this lets us reject the null hypothesis.
BUT because we have a low R-squared value of .005, word_count explains very little of the variation in the polarity score.
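The individual quantities discussed above can also be read directly off the fitted results object (a small sketch using standard statsmodels attributes):
# Pulling the regression quantities out of the fitted statsmodels results object
print(results.params['word_count'])          # slope coefficient for word_count
print(results.conf_int().loc['word_count'])  # 95% confidence interval for the slope
print(results.pvalues['word_count'])         # p-value for the slope
print(results.rsquared)                      # R-squared of the model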
# draw a scatter plot
groups = data[["polarity", "Response Accuracy", "word_count"]].groupby('Response Accuracy')
# Plot
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
ax.plot(group.polarity, group.word_count, marker='o', linestyle='', ms=12, label=name)
ax.legend()
plt.title("Word Count, Polarity, Response Accuracy", fontsize = 20)
plt.ylabel("Word Count", fontsize = 20)
plt.xlabel("Polarity", fontsize = 20)
plt.show()
#plt.scatter(data['polarity'], data['word_count'], hue=data['Response Accuracy'])
# create decision boundary
# first let's make a bunch of points that we need to classify
#x_array = np.array()
#y_array = np.array()
#for x in np.arange(-2, 2.1, 0.1):
# for y in np.arange(-2, 2.1, 0.1):
# x_array = np.append(x_array, x)
# y_array = np.append(y_array, y)
#
#test_grid = Table().with_columns(
# 'Polarity', x_array,
# 'word_count', y_array
#)
# make function to classify new people
#def classify_grid(training, test, k):
# c = make_array()
# for i in range(test.num_rows):
# # Run the classifier on the ith patient in the test set
# c = np.append(c, classify(training, make_array(test.row(i)), k))
#return c
#classify all the points with respect to their nearest neighbor
# (note need to drop color/status variables here)
#c = classify_grid(data.drop('Accuracy','Color'), test_grid, 1)
##now draw the test set in terms of its colors (based on NN), but wash it out w alpha coloring
#test_grid.scatter('Age', 'Sodium', group='status', alpha=0.4, s=30)
# on top of that, plot the *actual* points
#plots.scatter(ckd_small.column('Age'), ckd_small.column('Sodium'), c=ckd_small.column('Color'), edgecolor='k')
#plots.xlim(-2, 2)
#plots.ylim(-2, 2); #the boundary is being drawn in the restricted -2,2 space