import pandas as pd
import numpy as np
# read in the data and keep only messages labeled Inbox
df = pd.read_csv("gmailcontent.csv")
df = df[df['labels'] == "Inbox"]
print(df.shape)
# fix the dates: drop rows with no date at all, then parse the rest
data = df[df['date'].notnull()].copy()  # .copy() avoids SettingWithCopyWarning below
good = []
for _, i in data.iterrows():
    try:
        thedate = pd.to_datetime(i['date'])
    except (ValueError, TypeError):
        # some raw dates carry junk after the seconds field; find the second
        # ':' and keep everything through HH:MM:SS, then parse that
        place = i['date'].replace(':', 'X', 1).find(':')
        thedate = pd.to_datetime(i['date'][:place + 3])
    good.append(thedate)
data['newdate'] = good
print(data['newdate'].iloc[0])
print(data['newdate'].min())
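# a minimal alternative sketch: errors='coerce' vectorizes the happy path,
# turning unparseable dates into NaT so only those rows need the fallback above
parsed = pd.to_datetime(data['date'], errors='coerce')
print(parsed.isnull().sum())  # rows the vectorized pass could not handle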
# if message contains no body, drop it
data.dropna(subset=['body'], inplace=True)
print(data['body'].isnull().sum())  # should now be 0
# define function to pull the domain out of a sender address
def getdomain(s):
    # keep everything after the '@' and drop the closing '>' of "Name <addr>" formats
    spot = s.find('@') + 1
    sender = s[spot:].replace('>', '')
    return sender.strip()
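# quick sanity check on a made-up sender string (hypothetical address)
print(getdomain('Jane Doe <jane.doe@example.com>'))  # -> example.com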
# define most common domain names
commons = ['gmail.com', 'yahoo.com', 'aol.com', 'hotmail.com', 'us.af.mil']
# parse out the domain name and drop the most common domains
data['url'] = data['from'].apply(getdomain)
data = data[~data['url'].isin(commons)]
print(data['url'].unique()[:3])
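# how many messages survive the domain filter
print(data.shape)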
# define function to count emails from most frequent senders
def mostread(date=pd.to_datetime('2016-09-01 00:00:00'), end=0, start=1, top=10):
    # define date range: `start` years back through `end` years back from `date`
    enddate = date - pd.DateOffset(years=end)
    startdate = date - pd.DateOffset(years=start)
    # subset dataframe between date range
    dataset = data[data['newdate'] > startdate]
    dataset = dataset[dataset['newdate'] < enddate]
    # count the top x most frequent domains
    counts = dataset['url'].value_counts()[:top]
    return counts
print(mostread(end=0, start=1, top=20))
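# the window counts years back from `date`: start=1, end=0 is the most recent
# year, so the year before that would be:
print(mostread(end=1, start=2, top=10))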
from collections import Counter
from nltk.corpus import stopwords
# for word & bigram analysis define list of words to NOT count
frankwords = ['re:', 'fwd:', 'fw:', '-', 'wrote', 'pm', 'am', '>', '>>', '>>>', '>>>>', '>>>>>',\
'>>>>>>', '>>>>>>>', '*', '**', 'frank', 'corrigan', '2011', '2013', '2014', '2016', '2015',\
'------------------------------------------------------------',\
'get', "i'm", '+000', 'sent', '&', '|', ';', ')', '-----', '(', 'email', 'view', 'us',
'~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~', 'please',
'click', 'free', '/*', '\xe2\x80\xa2', '*/', '(m', '_____', '...', '+', '</td>', '</tr>' \
'bg']
stops = [unicode(word) for word in stopwords.words('english')] + frankwords
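# sanity check that both sources made it into the combined stoplist
print(len(stops), 'the' in stops, 'fwd:' in stops)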
def commonwords(date=pd.to_datetime('2016-09-01 00:00:00'), end=0, start=1, top=10, part='subject'):
    # define date range
    enddate = date - pd.DateOffset(years=end)
    startdate = date - pd.DateOffset(years=start)
    # subset dataframe between date range
    dataset = data[data['newdate'] > startdate]
    dataset = dataset[dataset['newdate'] < enddate]
    # drop rows missing the field being counted (subjects can be blank)
    dataset = dataset.dropna(subset=[part])
    # create bag of words and count list
    subject_word_bag = dataset[part].apply(lambda t: t.lower() + " ").sum()
    subject_words = [word for word in subject_word_bag.split() if word not in stops]
    # return the x most frequently occurring words
    return Counter(subject_words).most_common()[:top]
import warnings
warnings.filterwarnings('ignore')
print(commonwords(end=0, start=1, top=10, part='subject'))
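# the same helper works on message bodies, which are noisier but richer
print(commonwords(end=0, start=1, top=10, part='body'))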
from nltk import collocations
def getbigrams(date=pd.to_datetime('2016-09-01 00:00:00'), end=0, start=1, top=10, part='subject'):
    # define date range
    enddate = date - pd.DateOffset(years=end)
    startdate = date - pd.DateOffset(years=start)
    # subset dataframe between date range
    dataset = data[data['newdate'] > startdate]
    dataset = dataset[dataset['newdate'] < enddate]
    # drop rows missing the field being scored
    dataset = dataset.dropna(subset=[part])
    # create bag of words and count list
    subject_word_bag = dataset[part].apply(lambda t: t.lower() + " ").sum()
    subject_words = [word for word in subject_word_bag.split() if word not in stops]
    # use nltk to measure bigram associations
    bigram_measures = collocations.BigramAssocMeasures()
    bigram_finder = collocations.BigramCollocationFinder.from_words(subject_words)
    # ignore bigrams seen fewer than 20 times; scoring every rare pair takes a LONG time
    bigram_finder.apply_freq_filter(20)
    return bigram_finder.score_ngrams(bigram_measures.raw_freq)[:top]
print(getbigrams(end=0, start=3, top=20, part='body'))
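# raw_freq simply ranks pairs by how often they occur; as a variation (not part
# of the original analysis), PMI surfaces distinctive pairs instead; toy tokens below
toy = collocations.BigramCollocationFinder.from_words(
    ['new', 'york', 'city', 'new', 'york', 'times', 'new', 'york'])
print(toy.score_ngrams(collocations.BigramAssocMeasures().pmi)[:3])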