In [1]:
import pandas as pd
import numpy as np
In [2]:
# read in data and subset only read inbox messages
df = pd.read_csv("gmailcontent.csv") # read data
df = df[df['labels'] == "Inbox"]
print df.shape
(11060, 8)
In [3]:
# fix the dates
data = df[df['date'].isnull() == False]
good = []
for _, i in data.iterrows():
    try:
        thedate = pd.to_datetime(i['date'])
        good.append(thedate)
    except:
        place = i['date'].replace(':', 'X', 1).find(':')
        thedate = i['date'][:place+3]
        pd.to_datetime(thedate)
        good.append(thedate)
data['newdate'] = good
print data['newdate'].iloc[0]
print data['newdate'].min()
2016-06-14 19:16:32
2007-01-08 01:20:17
In [4]:
# if message contains no body, drop it
data.dropna(subset = ['body'], inplace=True)
print data['body'].isnull().sum()
0
In [5]:
# define function to clean urls
def getdomain(s):
    spot = s.find('@')+1
    sender = s[spot:].replace('>', '')
    cleandom = sender.strip()
    return cleandom
In [6]:
# define most common domain names
commons = ['gmail.com', 'yahoo.com', 'aol.com', 'hotmail.com', 'us.af.mil']
# parse out domain name and drop most common domains
data['url'] = data['from'].apply(lambda x: getdomain(x))
data = data[-data['url'].isin(commons)]
print data['url'].unique()[:3]
['linkedin.com' 'audiobooks.com' 'medium.com']
In [7]:
# define function to count emails from most frequent senders
def mostread(date = pd.to_datetime('2016-09-01 00:00:00'), end = 0, start = 1, top = 10):
    # define date range
    enddate = date - pd.DateOffset(years=end)
    startdate = date - pd.DateOffset(years=start)
    # subset dataframe between date range
    dataset = data[data['newdate'] > startdate]
    dataset = dataset[dataset['newdate'] < enddate]
    # count top x most frequent domains
    counts = dataset['url'].value_counts()[:top]
    
    return counts
In [11]:
print mostread(end = 0, start = 1, top = 20)
plus.google.com                    75
linkedin.com                       73
post.oreilly.com                   35
udacity.com                        30
quora.com                          25
slack.com                          22
paypal.com" <service@paypal.com    16
medium.com                         15
datascienceweekly.org              15
glassdoor.com                      15
dreamhost.com                      12
kaggle.com                         12
coursera.org                       10
mail.GovTrack.us                    9
runsignup.com                       9
google.com                          8
datasciencecentral.com              8
wordpress.com                       7
mail.vresp.com                      7
codeschool.com                      7
Name: url, dtype: int64
In [13]:
from collections import Counter
from nltk.corpus import stopwords
# for word & bigram analysis define list of words to NOT count
frankwords = ['re:', 'fwd:', 'fw:', '-', 'wrote', 'pm', 'am', '>', '>>', '>>>', '>>>>', '>>>>>',\
              '>>>>>>', '>>>>>>>', '*', '**', 'frank', 'corrigan', '2011', '2013', '2014', '2016', '2015',\
              '------------------------------------------------------------',\
              'get', "i'm", '+000', 'sent', '&', '|', ';', ')', '-----', '(', 'email', 'view', 'us',
              '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~', 'please',
              'click', 'free', '/*', '\xe2\x80\xa2', '*/', '(m', '_____', '...', '+', '</td>', '</tr>' \
              'bg']
stops = [unicode(word) for word in stopwords.words('english')] + frankwords
In [14]:
def commonwords(date = pd.to_datetime('2016-09-01 00:00:00'), end = 0, start = 1, top = 10, part = 'subject'):
    # define date range
    enddate = date - pd.DateOffset(years=end)
    startdate = date - pd.DateOffset(years=start)
    # subset dataframe between date range
    dataset = data[data['newdate'] > startdate]
    dataset = dataset[dataset['newdate'] < enddate] 
    if part == 'subject':
        data.dropna(subset = ['subject'], inplace=True)
    # create bag of words and count list
    subject_word_bag = dataset[part].apply(lambda t: t.lower() + " ").sum()
    subject_words = [word for word in subject_word_bag.split() if word.lower() not in stops]
    
    # return x most frequently occuring words
    return Counter(subject_words).most_common()[:top]
In [17]:
import warnings
warnings.filterwarnings('ignore')
commonwords(end = 0, start = 1, top = 10, part = 'subject')
Out[17]:
[('data', 158),
 ('analyst', 83),
 ("[udacity's", 60),
 ('nanodegree]', 60),
 ('new', 42),
 ('udacity', 23),
 ('hi', 23),
 ('science', 22),
 ('project', 17),
 ('payment', 17)]
In [18]:
from nltk import collocations
def getbigrams(date = pd.to_datetime('2016-09-01 00:00:00'), end = 0, start = 1, top = 10, part = 'subject'):
    # define date range
    enddate = date - pd.DateOffset(years=end)
    startdate = date - pd.DateOffset(years=start)
    # subset dataframe between date range
    dataset = data[data['newdate'] > startdate]
    dataset = dataset[dataset['newdate'] < enddate] 
    if part == 'subject':
        data.dropna(subset = ['subject'], inplace=True)
    # create bag of words and count list
    subject_word_bag = dataset[part].apply(lambda t: t.lower() + " ").sum()
    subject_words = [word for word in subject_word_bag.split() if word.lower() not in stops]
    # use nltk to measure bigram associations
    bigram_measures = collocations.BigramAssocMeasures()
    bigram_finder = collocations.BigramCollocationFinder.from_words(subject_words)
    # Filter to top x results; otherwise this will take a LONG time to analyze
    bigram_finder.apply_freq_filter(20)
    
    return bigram_finder.score_ngrams(bigram_measures.raw_freq)[:top] # top must be <= 20
In [19]:
getbigrams(end = 0, start = 3, top=20, part = 'body')
Out[19]:
[(('data', 'science'), 0.0017113140105257248),
 (('bg', 'bg'), 0.0013495052160767866),
 (('mountain', 'view,'), 0.0008504586030437685),
 (('view,', 'ca'), 0.0008296649941673928),
 (('read', 'more:'), 0.0007963952199651915),
 (('big', 'data'), 0.0007340143933360642),
 (('rights', 'reserved.'), 0.0007236175888978764),
 (('data', 'analyst'), 0.0006113321009654473),
 (('new', 'york'), 0.0005739036049879709),
 (("o'reilly", 'media,'), 0.0005739036049879709),
 (('copyright', '(c)'), 0.0005572687178868703),
 (('machine', 'learning'), 0.00054895127433632),
 (('reply', 'directly'), 0.0005468719134486825),
 (('new', 'york,'), 0.0005052846956959309),
 (('data', 'scientist'), 0.0004782530041566424),
 (('data', 'visualization'), 0.00047617364326900483),
 (('visit', 'support'), 0.00047409428238136725),
 (('4', 'et'), 0.00047201492149372967),
 (('@', '4'), 0.00047201492149372967),
 (('briefing', 'room'), 0.0004636974779431794)]