import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
import nltk
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import re
import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import statsmodels.api as sm
import statsmodels.formula.api as smf # importing stats model api for multiple regression

Data Cleaning¶

Stopwords¶

These are words that do not themselves have significant meaning for polarity purposes.

# Download stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Mackdig25/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

True

#example of stopwords
sw = stopwords.words('english')
sw[:5]

['i', 'me', 'my', 'myself', 'we']

Useful Functions for Data Cleaning¶

def stopwords(text):
    '''Lowercase a message and drop every token found in the stop-word list.

    The input is coerced with str(), split on whitespace, and each token is
    lowercased before it is checked against the module-level ``sw`` list.
    The surviving tokens are returned joined by single spaces.

    NOTE(review): this definition rebinds the name ``stopwords`` that the
    file imports from nltk.corpus, so ``sw`` has to be built before this
    function is defined.
    '''
    kept = []
    for token in str(text).split():
        lowered = token.lower()
        if lowered not in sw:
            kept.append(lowered)
    # Re-join the remaining words with a single space separator
    return ' '.join(kept)

# Compiled once at definition time instead of on every call.
# The ranges are deliberately narrow: a single span running from U+24C2 to
# U+1F251 would also match CJK, Hangul and other ordinary (non-emoji) text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # tiles/cards, enclosed alphanumerics (incl. flags), enclosed ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with emoji characters removed.

    Every run of characters that falls in one of the emoji code-point ranges
    of ``_EMOJI_PATTERN`` is deleted; all other characters, including
    non-Latin scripts, are left untouched.

    string: the text to clean (must be a str)
    returns: the cleaned str
    """
    return _EMOJI_PATTERN.sub('', string)

# Reading in Raw Data
data_rough = pd.read_csv('BobChat Conversations - All Data.csv')

Cleaning Messages for Analysis¶

Remove Punctuation¶

data = data_rough.copy()
# Delete commas, periods, quotes, semicolons, colons, question marks,
# ampersands and exclamation points in one pass. The characters sit inside a
# regex character class and regex=True is passed explicitly, so '.' and '?'
# are matched literally and the result does not depend on the pandas
# version's default for `regex`.
data['Message'] = data['Message'].str.replace(r'[,.";:?&!]', '', regex=True)
# Dashes and underscores join words, so they are replaced by a space rather
# than deleted.
data['Message'] = data['Message'].str.replace(r'[-_]', ' ', regex=True)

Distribution of Message Length Before Cleaning¶

sns.distplot(data_rough['Message'].str.len(),
             rug=True,
             bins=np.arange(0, 70,4),
             axlabel="Number of Characters (By Four)", color = "gold"
            )

sns.set(rc={'figure.figsize':(16,16)})  
sns.set(font_scale = 2)
plt.xlim(0, 70)
plt.xticks(np.arange(0,70,4))
plt.gca().set_ylabel('Proportion per character')
plt.title("Before Cleaning Character Proportion");

Applying Cleaning Functions¶

# Applying remove stop words function to messages to remove stopwords
data['Message'] = data['Message'].apply(stopwords) 

# Applying remove_emogi function to messages to remove emojis
data['Message'] = data['Message'].apply(remove_emoji)

sns.distplot(data['Message'].str.len(),
             rug=True,
             bins=np.arange(0, 70,4),
             axlabel="Number of Characters (By Four Characters)"
            )

sns.set(rc={'figure.figsize':(14,14)})  
sns.set(font_scale = 2)
plt.xlim(0, 70)
plt.xticks(np.arange(0,70,4))
plt.gca().set_ylabel('Proportion per character')
plt.title("Post Cleaning Character Proportion");

sns.distplot(data_rough['Message'].str.len(),
             rug=True,
             bins=np.arange(0, 70,4),
             axlabel="Number of Characters (By Four)", color = "yellow"
            )
sns.distplot(data['Message'].str.len(),
             rug=True,
             bins=np.arange(0, 70,4),
             axlabel="Number of Characters (By Four Characters)"
            )
sns.set(rc={'figure.figsize':(10,10)})  
sns.set(font_scale = 2)
plt.xlim(0, 70)
plt.legend(labels = ["Pre-cleaning", "Post-cleaning"])
plt.xticks(np.arange(0,70,4))
plt.gca().set_ylabel('Proportion per character')
plt.title("Pre vs Post Cleaning Character Proportion")

Text(0.5, 1.0, 'Pre vs Post Cleaning Character Proportion')

Converting Timestamp from string to datetime¶

data['Timestamp']= pd.to_datetime(data['Timestamp'])

Analysis¶

Data Set Description¶

Size of Data Set¶

print(f"There are {data.shape[0]} Rows and {data.shape[1]} Columns currently in the Data Set")

There are 6427 Rows and 11 Columns currently in the Data Set

Data Creation¶

Word Count¶

# Count the words in each cleaned message. Splitting on any run of whitespace
# (instead of on a single ' ') gives 0 for an empty message and keeps leading
# or doubled spaces left behind by the cleaning steps from being counted as
# extra words.
data["word_count"] = data['Message'].apply(lambda x: len(str(x).split()))

Day of the Week¶

#['weekday'].replace({0: 'Sun', 1 : 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri', 6: 'Sat'}, inplace = True)

#data['weekeday'] = data[data['Timestamp']].weekday()
data['Timestamp'] = pd.to_datetime(data['Timestamp'])

data['day_of_week'] = data['Timestamp'].dt.day_name()

Hour column¶

data['hour'] = round((data["Timestamp"].dt.hour) + (data["Timestamp"].dt.minute)/60 + (data["Timestamp"].dt.second)/60**2)

Top words¶

Using sort we can find the most common words used in our dataset post cleaning.

# Splitting the string type messages, then counting them
top_words = data['Message'].str.split(expand = True).stack().value_counts()
top_20 = top_words[:20]

Visual for most common words¶

ax = top_20.plot(kind ='bar', figsize = (15,10), rot = 60)

ax.set_alpha(.20)
ax.set_title("Most common words in the Bobchat Dataset", fontsize = 20)
ax.set_ylabel("Count")
ax.set_xlabel("Words")


#create a list to collect the plt.patches data
totals = []

#find the values and append to list
for i in ax.patches:
    totals.append(i.get_height())
    
#set individual bar lables using above list
total = sum(totals)

#set individual bar lables using above list
for i in ax.patches:
    #get_x pulls left or right ; get_height pushes up or down
    ax.text(i.get_x()-.06, i.get_height()+.06, \
           str(round((i.get_height()/total)*100,1))+'%', fontsize=15, color = 'black')

Most common pairs of words¶

def find_ngrams(input_list, n):
    '''Return every run of n consecutive items of input_list as a tuple.

    The sequence is zipped against copies of itself shifted by 0..n-1
    positions, so the result stops as soon as the shortest copy runs out.
    '''
    shifted = []
    for offset in range(n):
        shifted.append(input_list[offset:])
    return list(zip(*shifted))
from collections import Counter
from itertools import chain

pairs = data['Message'].map(lambda x: find_ngrams(x.split(' '), 2)).to_frame()
pairs
new_pairs = []
for p in pairs["Message"]:
    if p != []:
        new_pairs.append(p)

drop_word = ''
newest_pairs = []
for pair in new_pairs:
    
    if drop_word in chain(*pair):
        continue
    else:
        newest_pairs.append(pair)

newest_pairs

# Bigram Frequency Counts
bigrams = newest_pairs
bigrams = list(chain(*bigrams))
bigrams = [(x.lower(), y.lower()) for x,y in bigrams]


bigram_counts = Counter(bigrams)


bigram_counts = bigram_counts.most_common()
top20_bigram = bigram_counts[:20]
keys = [b[0] for b in top20_bigram]

values = [b[1] for b in top20_bigram]
x = []
for t in keys:
    x.append(" ".join(t))
x
plt.bar(x, values)
sns.set(rc={'figure.figsize':(18,10)})  
sns.set(font_scale = 2)

plt.xticks(range(20),rotation=75)
plt.xlabel('Bigrams', fontsize=30)
plt.ylabel('Count', fontsize=30)
plt.title("Bobchat: Most common appearance of words together", fontsize=30)
plt.show()

What is the Most Active Hour For Conversations ?¶

series_hour = data['hour'].sort_values()
sns.distplot(series_hour, hist=True, color = "red")    
sns.set(rc={'figure.figsize':(15,10)})  
sns.set(font_scale = 2)
plt.xlabel('Hour', fontsize=30)
plt.ylabel('Proportion', fontsize=30)
plt.title("Bobchat: Most Active Hour For Conversation", fontsize=30)
plt.show()
;

''

#message_hour_group = data[["Message", "hour"]].groupby("hour")
#message_hour_group.str.split(expand = True).stack().value_counts().split(expand = True).stack().value_counts()

What is the most active date for a conversation?¶

data["Date"]= [d.date() for d in data["Timestamp"]]

date_message_count = data[["Message", "Date"]].groupby("Date").count()
date_message_count.reset_index(inplace=True)
date_message_count = date_message_count[(date_message_count["Date"] <= datetime.date(2019,3,30)) & (date_message_count["Date"] >= datetime.date(2018,12,1))]
date_message_count["Date"].min()

datetime.date(2019, 2, 12)

# x and y are passed by keyword: seaborn releases that accept only `data`
# positionally reject positional x/y vectors.
sns.lineplot(x=date_message_count["Date"], y=date_message_count["Message"])
#sns.plot(date_message_count["Date"], date_message_count["Message"], kind ="bar")

sns.set(font_scale = 2)
plt.xlabel('Date', fontsize=30)
plt.xticks(rotation=45)
plt.ylabel('Message Count', fontsize=30)
plt.title("Bobchat: Conversation Over Time", fontsize=30)
plt.show()
;

''

What is the most active day of the week for conversations?¶

series_day = data.groupby(["day_of_week"]).count()

# groupby() returns the day names sorted alphabetically, so each count is
# looked up by name to pair every bar with its own label. A day with no
# messages is shown as 0.
x = ['Sunday','Monday','Tuesday','Wednesday', 'Thursday', 'Friday','Saturday']
day_counts = series_day['Message'].reindex(x, fill_value=0)
# Plain arrays passed by keyword, so the bars are matched to the labels by
# position and nothing is re-aligned on the Series index.
ax = sns.barplot(x=x, y=day_counts.values)
sns.set(rc={'figure.figsize':(15,10)})  
sns.set(font_scale = 2)

plt.xlabel('Day of the week', fontsize = 20)
plt.ylabel('Proportion of Messages', fontsize = 20)
plt.title("Bobchat: Most Active Day of the week For Conversation", fontsize = 30)
#create a list to collect the plt.patches data
totals = []

#find the values and append to list
for i in ax.patches:
    totals.append(i.get_height())
    
#set individual bar lables using above list
total = sum(totals)

#set individual bar lables using above list
for i in ax.patches:
    #get_x pulls left or right ; get_height pushes up or down
    ax.text(i.get_x()+ .2, i.get_height()+.1, \
           str(round((i.get_height()/total)*100,1))+'%', fontsize=20, color = 'black')
plt.show()

Sentiment Analysis of conversation using Vader¶

We will use the VADER (Valence Aware Dictionary and Sentiment Reasoner) lexicon to analyze the sentiment of the Bobchat messages. VADER is a lexicon and rule-based sentiment analysis tool that is specifically useful for sentiments in social media. The VADER lexicon gives the sentiment of individual words.

# Getting Vader Data Set
url = 'https://raw.githubusercontent.com/cjhutto/vaderSentiment/master/vaderSentiment/vader_lexicon.txt'

vader = pd.read_csv(url, sep = '\t', header = None )
#Renaming Columns
vader = vader.rename(columns = {1:'polarity', 0 : 'token', 2:'standard dev', 3: 'raw scores'})
#Indexing by Token
vader = vader.set_index('token')

Getting Data Ready for Polarity¶

# Creating an Id column
data['id'] = list(range(len(data)))

#Setting Index to Id
data = data.set_index('id')

# Subset of data
# Splitting Messages and explanding them so each word is a column
message_split = data['Message'].str.split(expand = True)

# Inverting the Subset
numbered_columns = message_split.columns.values
# resetting index, removing it from id
message_split.reset_index(inplace = True)
# Creating Tidy format by merging the numbered columns with the Message split table
tidy_format = pd.melt(message_split, id_vars=['id'], value_vars=numbered_columns)

# Renaming Columsn for join to match vader set
tidy_format = tidy_format.rename(columns = {'variable' : 'num', 'value' : 'word'})

tidy_format = tidy_format.dropna()

tidy_format = tidy_format.sort_values(['id', 'num'])

# joining vader and Tidy Format
tidy_format_sent_merged = tidy_format.merge(vader, how = 'left', left_on = 'word', right_index = True)

tidy_format_sent_merged = tidy_format_sent_merged.loc[:, ['id','num', 'word', 'polarity']]

tidy_format_sent_merged = tidy_format_sent_merged.fillna(0)

# Sum the per-word polarity scores within each message id. Only the polarity
# column is aggregated; a sum of the 'num' or 'word' columns has no meaning.
grouped_tidy_format_sent_merged = tidy_format_sent_merged.groupby('id').agg({'polarity': 'sum'})

# Add the polarity of each message to the data set, aligned on the id index.
# A Series (not a one-column DataFrame) is assigned so the result is a plain
# numeric column. Messages that have no words left after cleaning have no
# rows in the tidy table and therefore get NaN here.
data['polarity'] = grouped_tidy_format_sent_merged['polarity']

print('Most negative messages:')
for m in data.sort_values('polarity').head()['Message']:
    print('\n  ', m)

Most negative messages:

   i’ll kill i’m going kill

   nyu report suicide sexual assaults happen campus apartments

   nyu going address sexual assault problem campus anytime soon

   kill

   wanna kill

print('Most positive messages:')
for m in data.sort_values('polarity', ascending = False).head()['Message']:
    print('\n  ', m)

Most positive messages:

   please i'm planning come nyu finish current studies(i'm left one year still bachelor human resources development major psychology minor) question requirements need order get admitted nyu along getting scholarship please keep mind i'm honors student current university active strong gpa

   best place poop peace

   yes help make friends

   i’m strongly looking oncampus employment careernet applied everywhere got response kindly help find one

   alumni would like know request immunization records best way go registrar’s office

What is the Average Polarity Score for our Data?¶

print(f'The average polarity score for our Conversations is {round(np.mean(data["polarity"]),4)}')
print(f'The standard error or varying scores of our polairty scores is {round(np.std(data["polarity"]),3)}')

The average polarity score for our Conversations is 0.0738
The standard error or varying scores of our polairty scores is 0.691

This indicates that, on average, our conversations are more positive than negative, but we cannot say so with much confidence: the polarity scores vary widely, and our data is only a sample of the messages we would see if we were to speak to everyone on campus.

plt.figure(figsize=(10,8),edgecolor='blue')
plt.hist(data["polarity"], bins = 10)

#plot the mean and median of the heights
#median in red
#mean in gold
plt.scatter(np.mean(data['polarity']), 0, color='gold', s=200, edgecolors="black");
plt.title("Polarity Distribution")
plt.xlabel("Polarity")
plt.ylabel("Count")

/Users/Mackdig25/opt/anaconda3/lib/python3.7/site-packages/numpy/lib/histograms.py:839: RuntimeWarning: invalid value encountered in greater_equal
  keep = (tmp_a >= first_edge)
/Users/Mackdig25/opt/anaconda3/lib/python3.7/site-packages/numpy/lib/histograms.py:840: RuntimeWarning: invalid value encountered in less_equal
  keep &= (tmp_a <= last_edge)

Text(0, 0.5, 'Count')

What would speaking to everyone on campus look like?¶

def bootstrap_mean(original_sample, label, replications):
    """Return an array of bootstrapped sample means.

    original_sample: table (DataFrame) containing the original sample
    label: label of the column containing the variable
    replications: number of bootstrap samples to draw

    Each replication resamples the column with replacement at its original
    size (frac=1, replace=True) and records the mean of that resample.
    Calling sample() with no arguments would draw a single row, which yields
    individual observations rather than resampled means.

    returns: numpy array of length ``replications``
    """
    just_one_column = original_sample[label]
    # Collect in a list and convert once; np.append inside a loop copies the
    # whole array on every iteration.
    means = [
        np.mean(just_one_column.sample(frac=1, replace=True))
        for _ in range(replications)
    ]
    return np.array(means)

resampled_means = bootstrap_mean(data, "polarity", 1000)
plt.figure(figsize=(10,8),facecolor='violet',edgecolor='blue')
plt.hist(resampled_means, bins = 10)

#plot the mean and median of the heights
#median in red
#mean in gold
plt.scatter(np.mean(data['polarity']), 0, color='gold', s=200, edgecolors="black");
plt.title("Bootstrapped Polarity (1000 Iterations)")
plt.xlabel("Polarity")
plt.ylabel("Count")

Text(0, 0.5, 'Count')

Messages by Response Group¶

data["Response Group"].unique()

array(['Small Talk', 'Emergency', 'Development', 'Fallback',
       'Public Safety', 'Miscellaneous', 'Wasserman', 'Global Services',
       'Global Spiritual Life', 'Dining', 'Athletics', 'Libraries',
       'Feedback', 'Academics', 'Human', 'Campus', 'Housing', 'NYC',
       'Center for Student Life', 'ULC/ARC', 'NYU Traditions',
       'Admissions', 'Student Government', 'Redirects', 'Majors',
       'Special Sessions', 'Weather', 'Student Health Center',
       'Registrar', 'Study Away', 'Finances', 'Shanghai', 'Food & Drink',
       'StudentLink', 'Entrepreneurship', 'Seasonal', 'Schools', 'Alumni',
       'Abu Dhabi', 'Administration', 'LGBTQ+'], dtype=object)

Campus = data[data["Response Group"] == 'Campus']
print('Most positive Campus messages:')
for m in Campus.sort_values('polarity', ascending = False).head()['Message']:
    print('\n  ', m)

Most positive Campus messages:

   please i'm planning come nyu finish current studies(i'm left one year still bachelor human resources development major psychology minor) question requirements need order get admitted nyu along getting scholarship please keep mind i'm honors student current university active strong gpa

   best place nap

   best nap spots

   best nap spots

   best place study

print('Most negative Campus messages:')
for m in Campus.sort_values('polarity').head()['Message']:
    print('\n  ', m)

Most negative Campus messages:

   i’m failing college

   i'm failing classes

   printer palladium always broken

   i’m lost brooklyn campus

   i’m lost bobst

Dining = data[data["Response Group"] == 'Dining']
print('Most positive Dining messages:')
for m in Dining.sort_values('polarity', ascending = False).head()['Message']:
    print('\n  ', m)

Most positive Dining messages:

   best dining hall

   best dining hall

   what's best dining hall

   dining hall best

   urgent care close

print('Most negative Dining messages:')
for m in Dining.sort_values('polarity').head()['Message']:
    print('\n  ', m)

Most negative Dining messages:

   depression

   depression

   think depression

   think depression

   food insecurity

Libraries = data[data["Response Group"] == 'Libraries']
print('Most positive Libraries messages:')
for m in Libraries.sort_values('polarity', ascending = False).head()['Message']:
    print('\n  ', m)

Most positive Libraries messages:

   yes please

   yes please

   yes please

   yes please

   yes please

print('Most negative Libraries messages:')
for m in Libraries.sort_values('polarity').head()['Message']:
    print('\n  ', m)

Most negative Libraries messages:

   bobst wifi suck

   time bobst closed today

   bobst closed today

   open

   dibner library

Global_Services = data[data["Response Group"] == 'Global Services']
print('Most positive Global Services messages:')
for m in Global_Services.sort_values('polarity', ascending = False).head()['Message']:
    print('\n  ', m)

Most positive Global Services messages:

   security help

   share social security number

   need security

   important dates freshman

   important info

print('Most negative Global Services messages:')
for m in Global_Services.sort_values('polarity').head()['Message']:
    print('\n  ', m)

Most negative Global Services messages:

   opt workshop

   postgrad info

   need visa i20 qualified applying dormitory

    global services

    global services

These don't seem to be that helpful. I am seeing the faults in the Vader polarity grading system.¶

Freeform vs. Button¶

message_type_count = data.groupby(["Message Type"]).count()

Removing Unsure outliers¶

data.drop(data.loc[data["Message Type"] == "Unsure"].index, inplace = True)
message_type_count = data.groupby(["Message Type"]).count()

x = ["Button", "Freeform"]
y = message_type_count['Message']
# x and y are passed by keyword because newer seaborn releases reject
# positional x/y. The counts go in as a plain array so they are paired with
# the labels by position.
sns.barplot(x=x, y=y.values)
sns.set(rc={'figure.figsize':(8,5)})
sns.set(font_scale = 2)
plt.xlabel('Message Type', fontsize = 20)
plt.ylabel('Count of Messages', fontsize=20)
plt.title("Bobchat: Message Type For Conversation", fontsize = 20)
plt.show()

data["Response Accuracy"].unique()

array(['Full', 'None', 'Partial'], dtype=object)

series = data.groupby(["Message Type","Response Accuracy"]).count()

pivot_MessageType_Accuracy_count = pd.pivot_table(
    data, index='Message Type', columns="Response Accuracy", values= 'Message', aggfunc= len)
pivot_MessageType_Accuracy_count

pd.crosstab(data['Message Type'],data['Response Accuracy']).plot.bar()
sns.set(rc={'figure.figsize':(15,8)})
sns.set(font_scale = 2)
plt.xlabel('Message Type', fontsize = 20 )
plt.xticks(rotation=0)
plt.ylabel('Proportion of Messages', fontsize= 20)
plt.title("Bobchat: Message Type For Conversation By Accuracy of Response", fontsize = 20)
plt.show()

Freeform accuracy rate¶

freeform_data = data[data["Message Type"] == "Freeform"]
none = (freeform_data[freeform_data["Response Accuracy"] == 'None'].count()/ len(freeform_data) ) * 100
partial = (freeform_data[freeform_data["Response Accuracy"] == 'Partial'].count()/ len(freeform_data) ) * 100
full = (freeform_data[freeform_data["Response Accuracy"] == 'Full'].count()/ len(freeform_data) ) * 100
# Report each share to one decimal place. int() would truncate the fractional
# part, which makes the three figures add up to less than 100.
print("Freeform Accuracy Percentage \nNone:",
      f'{none["Message"]:.1f}', "\nPartial:",
      f'{partial["Message"]:.1f}', "\nFull:",
      f'{full["Message"]:.1f}')

Freeform Accuracy Percentage 
None: 39 
Partial: 12 
Full: 47

# Freeform messages whose original response accuracy was "None"
new = freeform_data[(freeform_data["Response Accuracy"] == "None") ]
improved = new[["Fixed or Improved","Message"]].groupby("Fixed or Improved").count()
x = ["Did not Improve", "Improved"]
# x and y are passed by keyword because newer seaborn releases reject
# positional x/y. The counts go in as a plain array so they are paired with
# the labels by position.
sns.barplot(x=x, y=improved["Message"].values)
sns.set(rc={'figure.figsize':(8,5)})
sns.set(font_scale = 2)
plt.xlabel('Accuracy Improvement', fontsize = 20)
plt.ylabel('Count of Messages', fontsize=20)
plt.title("Bobchat Upgrade Assesment on Previous Innaccurate Messages", fontsize = 20)

Text(0.5, 1.0, 'Bobchat Upgrade Assesment on Previous Innaccurate Messages')

Response Group¶

Res_Group = data.groupby(["Response Group"]).count()

Top Five Response Group¶

display = data[["Response Group", "Message"]]
display = display.groupby(["Response Group"]).count()
display.sort_values('Message', ascending = False)[:5]

response_bar = display.sort_values('Message', ascending = False)
response_bar = response_bar.reset_index("Response Group")
clrs = ['red' if (x == "Fallback") else 'blue' for x in response_bar['Response Group'] ]
#sb.barplot(x=idx, y=values, palette=clrs) # color=clrs)
sns.barplot(x = response_bar["Response Group"], y = response_bar["Message"], palette = clrs)
sns.set(rc={'figure.figsize':(15,10)})
sns.set(font_scale = 2)
plt.xlabel('Response Group', fontsize = 20 )
plt.xticks(rotation=90)
plt.ylabel('Count of Messages', fontsize= 20)
plt.title("Bobchat: Response Group By Message Count", fontsize = 20)
plt.show()
#response_bar.head()

Fallback Group¶

The fallback is a response type when the chatbot does not understand the question or cannot find a response to fit the question. It returns a standardized message asking the user if it would like to speak to a human.

# Subset the data to only messages of fallback
fallback = data[data["Response Group"] == "Fallback"]
print( f' There are {fallback.shape[0]} messages that received a Fallback Response')

 There are 415 messages that received a Fallback Response

Most common word¶

# Splitting the string type messages, then counting them
Fallback_top_words = fallback['Message'].str.split(expand = True).stack().value_counts()
Fallback_top_words_20 = Fallback_top_words[:20]

ax = Fallback_top_words_20.plot(kind ='bar', figsize = (15,10), rot = 60)

ax.set_alpha(.20)
ax.set_title("Most common words in messages that received a Fallback Response", fontsize = 20)
ax.set_ylabel("Count")
ax.set_xlabel("Words")


#create a list to collect the plt.patches data
totals = []

#find the values and append to list
for i in ax.patches:
    totals.append(i.get_height())
    
#set individual bar lables using above list
total = sum(totals)

#set individual bar lables using above list
for i in ax.patches:
    #get_x pulls left or right ; get_height pushes up or down
    ax.text(i.get_x()-.06, i.get_height()+.06, \
           str(round((i.get_height()/total)*100,1))+'%', fontsize=15, color = 'black')

print('Most negative Fallback messages:')
for m in fallback.sort_values('polarity').head()['Message']:
    print('\n  ', m)

Most negative Fallback messages:

   nyu racist

   nyu racist

   nyu setup campuses places serious human rights abuses

   avoid debt

   hate life

print('Most positive Fallback messages:')
for m in fallback.sort_values('polarity', ascending = False)['Message'][1:6]:
    print('\n  ', m)

Most positive Fallback messages:

   where's best place grab lunch

   best food campus

   what’s best place eat campus

   best bathroom

   best boba

Top Five Negative Response Groups by Data Entry¶

# Data Entered Negative Sentiment
display2 = display = data[["Response Group", "Message", "Negative Sentiment"]]
display2 = display.groupby(["Response Group"]).agg(sum)
display2.sort_values('Negative Sentiment', ascending = False)[2:7]

Top Five Negative Response Groups by Polarity¶

# Vader Sentiment
# Mean of every numeric column per response group. numeric_only=True keeps
# text columns such as Message out of the aggregation, which would otherwise
# raise on pandas versions that no longer drop them silently.
polarity_sort = data.groupby(["Response Group"]).mean(numeric_only=True)
polarity_sort.sort_values('polarity')[0:5]

Most Positive Response Groups by polarity¶

polarity_sort.sort_values('polarity', ascending = False)[0:5]

Response Type¶

Res_Type = data[["Response Type", "Message", "Negative Sentiment", "polarity"]]

Res_Type_group = Res_Type[["Response Type", "Message"]].groupby(["Response Type"]).count()

Top Five Most Common Message Type¶

Res_Type_group.sort_values('Message', ascending = False)[0:5]

Top Five Negative Sentiment Resource Types User Data Entry¶

Res_Type_group = Res_Type[["Response Type", "Negative Sentiment"]].groupby(["Response Type"]).sum()
Res_Type_group.sort_values('Negative Sentiment', ascending = False)[0:5]

Top Five Negative Sentiment Resource Type by Polarity¶

# Mean of the numeric columns per response type. numeric_only=True excludes
# the text Message column, which cannot be averaged.
Res_Type_pol = Res_Type.groupby(["Response Type"]).mean(numeric_only=True)
Res_Type_pol.sort_values('polarity')[0:5]

Top five Positive Sentiment Resource Type by Polarity¶

Res_Type_pol.sort_values('polarity', ascending = False)[0:5]

MessageType_ResponseType = pd.pivot_table(data, index='Response Type', columns='Message Type',
    values='Message', aggfunc=len)
MessageType_ResponseType = MessageType_ResponseType.fillna(0)

#MessageType_ResponseType.plot(kind= 'bar', figsize = (10,8))
#plt.title("Registered Names vs Year Stratified by Gender")
#plt.gca().set_ylabel('Names Registered that Year');

Correlations¶

# Correlate only the numeric/boolean columns; text and datetime columns
# cannot be correlated and make corr() raise on recent pandas versions.
data.select_dtypes(include=['number', 'bool']).corr()

With data as it currently stands there does not appear to be any correlations, but we can adjust the data types

We need to change some of the values in the data set so that they are quantitative. pd.get_dummies creates 1 or 0 values for each unique category in a column.

Below I have chosen to add Response Type, Response Group, Message Type, Negative Sentiment, Word Count, Hour and Day of the week to predict polarity.

one_hot = pd.get_dummies(data[["Response Type",
                               "Response Group",
                               "Message Type",
                               "Negative Sentiment",
                               "word_count",
                               "polarity", 
                              "hour",
                              "day_of_week"]])
one_hot.dropna(inplace = True)

correlation = one_hot.corr()
large_corr = correlation[abs(correlation["polarity"]) > .2]

Nothing seems to be that highly correlated with polarity. We will move on nonetheless.

Predicting Sentiment¶

Without Message¶

# Create training and test sets

# Droping desired outcome polarity
X = one_hot.drop(columns=['polarity'])
Y = one_hot['polarity']

# Creating sets: Train 90 , Test 10
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.10)

# Create model off of training set
model = LinearRegression().fit(X_train, Y_train)

# create predictors
Y_pred = model.predict(X_test)

# find residuals
residuals = Y_pred - Y_test

plt.scatter(Y_test, residuals, s = 40)
sns.set(rc={'figure.figsize':(8,5)})
sns.set(font_scale = 2)
plt.axhline(0, color = 'g')
plt.title("Observed Ys and Residuals", fontsize = 20)
plt.ylabel("Residuals", fontsize = 20)
plt.xlabel("Observed Ys", fontsize = 20)

Text(0.5, 0, 'Observed Ys')

Turns out we can't predict polarity that well without the message or response.

def mse(actual_y, predicted_y):
    """Return the mean squared error between predicted and observed values."""
    errors = predicted_y - actual_y
    return np.mean(np.square(errors))
def mae(actual_y, predicted_y):
    """Return the mean absolute error between predicted and observed values."""
    errors = predicted_y - actual_y
    return np.mean(np.abs(errors))

train_error = mse(Y_train, model.predict(X_train))
test_error =  mse(Y_test,model.predict(X_test))
print(train_error, test_error)

train_error = mae(Y_train, model.predict(X_train))
test_error =  mae(Y_test,model.predict(X_test))
print(train_error, test_error)

0.2809249058943312 1833634868214563.2
0.24303649859987672 5285878.35918622

Negative Sentiment¶

This is a data entry by the department and not something viewed from the data. Lets remove it and see how we do.

one_hot2 = pd.get_dummies(data[["Response Type",
                               "Response Group",
                               "Message Type",
                               "word_count",
                               "polarity", 
                              "hour",
                              "day_of_week"]])
one_hot2.dropna(inplace = True)

# Create training and test sets

# Droping desired outcome polarity
X2 = one_hot2.drop(columns=['polarity'])
Y2 = one_hot2['polarity']

# Creating sets: Train 90 , Test 10
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, test_size = 0.10)

# Create model off of training set
model = LinearRegression().fit(X_train2, Y_train2)

# create predictors
Y_pred2 = model.predict(X_test2)

# find residuals
residuals2 = Y_pred2 - Y_test2

# Plot the residuals of this second model (residuals2) against its own test
# targets; `residuals` belongs to the first model's test split.
plt.scatter(Y_test2, residuals2, s = 40)
plt.axhline(0, color = 'g')
plt.title("Observed Ys and Residuals", fontsize = 20)
plt.ylabel("Residuals", fontsize = 20)
plt.xlabel("Observed Ys", fontsize = 20)

Text(0.5, 0, 'Observed Ys')

train_error2 = mse(Y_train2, model.predict(X_train2))
test_error2 =  mse(Y_test2,model.predict(X_test2))
print(train_error2, test_error2)

0.3009711578621345 8.905667690238136e+16

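In this run, removing negative sentiment increased both the training error (about 0.281 to 0.301) and the test error (about 1.8e15 to 8.9e16). Because each model was fit on a different random train/test split, the two runs are not directly comparable, and test errors this large indicate that neither model generalizes to unseen messages.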

Linear Regression with Polarity and Word_Count¶

Visualizing the Association between word_count and polarity¶

sns.scatterplot(x = 'polarity', y= 'word_count', data = data, y_jitter = .2, x_jitter = .2)
sns.set(rc={'figure.figsize':(8,6)})
plt.title("Word Count and Polarity", fontsize = 20)
sns.set(font_scale = 2)

#Conducitng a linear regression with word_count as the IV
results = smf.ols('polarity ~ word_count', data=data).fit()
results.summary() #Showing the results

Interpretting the coefficient, confidence interval, and p-value for the coefficient on x.¶

Coefficient: If the word_count is changed by 1 there will be a .05 change in the polarity score.
Confidence Interval: If we studied the association between word_count and polarity many times, 95% of the confidence intervals constructed this way would contain the true slope.
- The Confidence Interval of [.022, .09] does not contain 0 so we can reject the null with a 95% confidence.
P value: the p-value is the probability of observing an association at least this strong if the NULL hypothesis (that there is no relationship between word_count and polarity) were true; here it is .001. With that information and the confidence interval we can reject the Null Hypothesis.
BUT because we have a low R-Squared value of .005 that indicates that the word_count is not that good at explaining the variation in the polarity score.

Prediction By Classification¶

# draw a scatter plot

groups = data[["polarity", "Response Accuracy", "word_count"]].groupby('Response Accuracy')

# Plot
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    ax.plot(group.polarity, group.word_count, marker='o', linestyle='', ms=12, label=name)
ax.legend()

plt.title("Word Count, Polarity, Response Accuracy", fontsize = 20)
plt.ylabel("Word Count", fontsize = 20)
plt.xlabel("Polarity", fontsize = 20)
plt.show()

#plt.scatter(data['polarity'], data['word_count'], hue=data['Response Accuracy'])

# create decision boundary

# first let's make a bunch of points that we need to classify
#x_array = np.array()
#y_array = np.array()
#for x in np.arange(-2, 2.1, 0.1):
#    for y in np.arange(-2, 2.1, 0.1):
#        x_array = np.append(x_array, x)
#        y_array = np.append(y_array, y)
#        
#test_grid = Table().with_columns(
#    'Polarity', x_array,
#    'word_count', y_array
#)

# make function to classify new people
#def classify_grid(training, test, k):
 #   c = make_array()
 #   for i in range(test.num_rows):
  #      # Run the classifier on the ith patient in the test set
   #     c = np.append(c, classify(training, make_array(test.row(i)), k))   
    #return c

#classify all the points with respect to their nearest neighbor 
# (note need to drop color/status variables here)
#c = classify_grid(data.drop('Accuracy','Color'), test_grid, 1)

##now draw the test set in terms of its colors (based on NN), but wash it out w alpha coloring
#test_grid.scatter('Age', 'Sodium', group='status', alpha=0.4, s=30)
# on top of that, plot the *actual* points
#plots.scatter(ckd_small.column('Age'), ckd_small.column('Sodium'), c=ckd_small.column('Color'), edgecolor='k')
#plots.xlim(-2, 2)
#plots.ylim(-2, 2); #the boundary is being drawn in the restricted -2,2 space

	Fixed or Improved	Negative Sentiment	word_count	hour	polarity
Response Group
Emergency	0.000000	0.846154	2.153846	14.923077	-2.230769
Weather	0.007407	0.088889	2.918519	17.081481	-0.207407
Student Government	0.050000	0.050000	2.050000	13.250000	-0.125000
Feedback	0.015625	0.093750	1.937500	15.218750	-0.065079
Registrar	0.072993	0.014599	2.547445	14.846715	-0.059848

	Fixed or Improved	Negative Sentiment	word_count	hour	polarity
Response Group
Human	0.024096	0.000000	1.951807	15.891566	1.001205
Global Services	0.100000	0.100000	2.400000	12.250000	0.490000
Alumni	0.000000	0.000000	1.875000	16.750000	0.400000
Miscellaneous	0.048851	0.051724	2.597701	15.189655	0.247414
NYU Traditions	0.013699	0.013699	2.602740	13.739726	0.239726

	Message
Response Type
Fallback	415
Bus B	161
Dining Menus	155
Bus IDK	140
Academics Menu	139

	Negative Sentiment
Response Type
Fallback	100.0
Small Talk	28.0
Emergency	22.0
?Mental Health Resources	15.0
Cost of Attendance	14.0

	Negative Sentiment	polarity
Response Type
Emergency	0.846154	-2.230769
Tutorial	1.000000	-2.100000
?Eligible	0.000000	-1.750000
?Free Flu Shots	0.000000	-1.600000
Rory Meyers College of Nursing Address	1.000000	-1.300000

	Message
Response Group
Public Safety	924
Dining	659
Campus	465
Fallback	415
Housing	389

	Negative Sentiment
Response Group
Student Health Center	28.0
Finances	26.0
Emergency	22.0
Dining	19.0
Miscellaneous	18.0

	Negative Sentiment	polarity
Response Type
Psychology Building Address	0.000000	3.100000
?SSN	0.666667	2.466667
?Cross School Classes	0.000000	2.300000
Alumni Address	0.000000	2.300000
Museum Guide	0.000000	2.300000

	Fixed or Improved	Negative Sentiment	word_count	hour	polarity
Fixed or Improved	1.000000	0.158900	0.085058	0.025404	0.029071
Negative Sentiment	0.158900	1.000000	0.087421	0.019046	-0.229825
word_count	0.085058	0.087421	1.000000	-0.009042	0.178284
hour	0.025404	0.019046	-0.009042	1.000000	0.013061
polarity	0.029071	-0.229825	0.178284	0.013061	1.000000

Dep. Variable:	polarity	R-squared:	0.032
Model:	OLS	Adj. R-squared:	0.032
Method:	Least Squares	F-statistic:	210.2
Date:	Sat, 29 Jul 2023	Prob (F-statistic):	6.82e-47
Time:	13:45:29	Log-Likelihood:	-6615.8
No. Observations:	6405	AIC:	1.324e+04
Df Residuals:	6403	BIC:	1.325e+04
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	-0.1258	0.016	-7.777	0.000	-0.158	-0.094
word_count	0.0854	0.006	14.498	0.000	0.074	0.097

Omnibus:	1871.100	Durbin-Watson:	1.781
Prob(Omnibus):	0.000	Jarque-Bera (JB):	62477.716
Skew:	0.744	Prob(JB):	0.00
Kurtosis:	18.228	Cond. No.	5.75

Response Accuracy	Full	None	Partial
Message Type
Button	2937	21	5
Freeform	1649	1366	449