# What is the distribution of numbers of emails per user?
# What is the distribution of emails per folder?
# Are the users organizing their email into folders?
# Are certain folders common to all users?
##########################################################################

file.info("~/Downloads/enron_mail_030204.tar.gz")
# Output recorded when this was originally run:
#                                          size isdir mode               mtime
# ~/Downloads/enron_mail_030204.tar.gz 3.85e+08 FALSE  644 2007-07-31 07:56:03
#                                                    ctime               atime
# ~/Downloads/enron_mail_030204.tar.gz 2007-07-31 07:56:03 2008-06-28 13:44:48
#                                      uid gid  uname grname
# ~/Downloads/enron_mail_030204.tar.gz 501 501 duncan duncan

# Change R's view of the current working directory to the top directory
# of all the people's folders. This way we don't need to qualify the queries
# with an absolute fully-qualified directory name and we can use "."
# But we should abstract this.
setwd("~/Downloads/Enron/maildir/")

# How many "logins/people/users" are there?
# Each entry directly under the mail directory is one person. The vector is
# named by itself so that results of sapply()/lapply() over it are labelled
# with the person's name.
people = list.files(".")
names(people) = people
length(people)

# What is the distribution of emails per user?
# Within each user, we find all their folders and all their files.
# Each file is a message. So count the number of files, not directories.

# List every file found anywhere below the directory `person`.
# With recursive = TRUE, list.files() returns the files only (directories are
# not included), with paths relative to `person`.
findMessages = function(person) {
  list.files(person, recursive = TRUE)
}

# Number of messages per person (named integer vector).
numPerPerson = vapply(people, function(p) length(findMessages(p)), integer(1))

# Sort by number per person, decreasing, and show the top 50.
# head() is used instead of [1:50] so that fewer than 50 people does not
# produce NA padding.
head(sort(numPerPerson, decreasing = TRUE), 50)

########################
# Emails per folder
# For each person, tabulate the messages by the folder (relative directory)
# that contains them. lapply() keeps this a list of tables whatever the
# shape of the individual results.
numPerFolder = lapply(people, function(p) table(dirname(findMessages(p))))

# So how do we determine the distribution of emails per folder in a
# meaningful way?
summary(unlist(numPerFolder))
# Output recorded when this was originally run:
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#     1.0     3.0    13.0   156.3    72.0 11900.0

hist(unlist(numPerFolder))

# What is the distribution of emails per folder?
# First off, we can look in each folder under each person.
# How many directories do we see? Are there any files, i.e. individual
# messages?
# Inspect the entries directly under each person's directory: the `isdir`
# column of file.info() tells us which are sub-directories (folders) and which
# are plain files (individual messages).
# (file.info(x) on its own only describes the person's directory itself.)
lapply(people, function(x) file.info(list.files(x, full.names = TRUE)))

# Number of folders per person
# personFolderNames: one element per person, holding the names of the entries
# directly under that person's directory.
personFolderNames = lapply(people, list.files)
names(personFolderNames) = people

folderCounts = lengths(personFolderNames)

hist(folderCounts)
# why not kernel density!!!!
plot(density(folderCounts))

summary(folderCounts)

# So look beyond 3rd quartile.
# The cutoff is computed from the data rather than hard-coded (it was 20 for
# the run whose output is recorded below).
names(personFolderNames)[folderCounts > quantile(folderCounts, 0.75)]
# Output recorded when this was originally run:
#  [1] "arnold-j"      "beck-s"        "blair-l"       "campbell-l"    "cash-m"
#  [6] "dasovich-j"    "davis-d"       "farmer-d"      "germany-c"     "griffith-j"
# [11] "haedicke-m"    "kaminski-v"    "kean-s"        "lavorato-j"    "lokay-m"
# [16] "love-p"        "mann-k"        "mcconnell-m"   "neal-s"        "rogers-b"
# [21] "ruscitti-k"    "sanders-r"     "scholtes-d"    "scott-s"       "shackleton-s"
# [26] "shapiro-r"     "steffes-j"     "taylor-m"      "ward-k"        "watson-k"
# [31] "weldon-c"      "whalley-g"     "white-s"       "williams-w3"   "zipper-a"

# What are the common folders?
# Let's get the names of all the folders for each person and put them together
# to get a frequency table. Since no person can have the same folder twice
# within their mail directory, this count will be "across" people.
folderFreq = sort(table(unlist(personFolderNames, use.names = FALSE)),
                  decreasing = TRUE)
head(folderFreq, 20)
# Output recorded when this was originally run:
#   inbox               sent_items          deleted_items       all_documents
#   137                 136                 135                 110
#   discussion_threads  sent                notes_inbox         _sent_mail
#   93                  89                  82                  78
#   calendar            contacts            personal            tasks
#   71                  46                  39                  35
#   to_do               eol                 prc                 private_folders
#   34                  11                  10                  9
#   california          canada              ees                 presentations
#   7                   7                   7                   7

# As a proportion of users
head(folderFreq, 20) / length(people)

# Perhaps cut off at 50% of users.
tmp = folderFreq / length(people)
tmp[tmp > .5]

# Are the users organizing their email into folders?
# Perhaps a metric is how many folders they have.
# numFolders: named integer vector giving, for each person, the number of
# entries in their element of personFolderNames.
numFolders = lengths(personFolderNames)

sort(numFolders, decreasing = TRUE)
# Output recorded when this was originally run (names above their counts):
#   kean-s          beck-s          shackleton-s    shapiro-r       watson-k
#   187             114             111             110             69
#   blair-l         griffith-j      mcconnell-m     dasovich-j      steffes-j
#   68              68              65              63              54
#   campbell-l      kaminski-v      ward-k          arnold-j        haedicke-m
#   53              48              48              46              46
#   sanders-r       farmer-d        weldon-c        germany-c       ruscitti-k
#   40              38              37              35              35
#   mann-k          lavorato-j      taylor-m        white-s         cash-m
#   34              33              33              31              29
#   zipper-a        williams-w3     davis-d         rogers-b        lokay-m
#   26              24              23              23              22
#   scholtes-d      love-p          neal-s          scott-s         whalley-g
#   22              21              21              21              21
#   hayslett-r      richey-c        symes-k         ybarbo-p        storey-g
#   20              20              20              20              19
#   wolfe-j         mckay-j         mclaughlin-e    staab-t         badeer-r
#   19              18              18              17              16
#   baughman-d      shively-h       buy-r           geaccone-t      lay-k
#   16              16              15              15              15
#   quigley-d       bass-e          hernandez-j     hyatt-k         stclair-c
#   15              14              14              14              14
#   sturm-f         brawner-s       ring-r          sager-e         semperger-c
#   14              13              13              13              13
#   smith-m         carson-m        corman-s        cuilla-m        derrick-j
#   13              12              12              12              12
#   fischer-m       gilbertsmith-d  king-j          nemec-g         schoolcraft-d
#   12              12              12              12              12
#   shankman-j      arora-h         dean-c          dorland-c       giron-d
#   12              11              11              11              11
#   perlingiere-d   schwieger-j     skilling-j      allen-p         hodge-j
#   11              11              11              10              10
#   jones-t         keavey-p        lewis-a         maggi-m         martin-t
#   10              10              10              10              10
#   presto-k        williams-j      benson-r        gay-r           grigsby-m
#   10              10              9               9               9
#   hendrickson-s   kuykendall-t    may-l           tholt-j         townsend-j
#   9               9               9               9               9
#   tycholiz-b      whalley-l       delainey-d      donohoe-t       ermis-f
#   9               9               8               8               8
#   horton-s        kitchen-l       mckay-b         mims-thurston-p pereira-s
#   8               8               8               8               8
#   pimenov-v       quenet-j        ring-a          rodrique-r      saibi-e
#   8               8               8               8               8
#   fossum-d        lenhart-m       lokey-t         rapp-b          sanchez-m
#   7               7               7               7               7
#   south-s         heard-m         hyvl-d          mccarty-d       platter-p
#   7               6               6               6               6
#   solberg-g       thomas-p        zufferli-j      bailey-s        crandell-s
#   6               6               6               5               5
#   donoho-l        forney-j        hain-m          keiser-k        lucci-p
#   5               5               5               5               5
#   panus-s         salisbury-h     stepenovitch-j  swerzbin-m      causholli-m
#   5               5               5               5               4
#   gang-l          guzman-m        linder-e        merriss-s       parks-j
#   4               4               4               4               4
#   phanis-s        reitmeyer-j     dickson-s       holst-k         meyers-a
#   4               4               3               3               3
#   motley-m        slinger-r       whitt-m         harris-s        stokley-c
#   3               3               3               2               1

# Which are the ones beyond the 75th percentile (3rd quartile)?
# People whose folder count lies strictly above the 75th percentile of
# numFolders, largest first.
sort(numFolders[numFolders > quantile(numFolders, .75)], decreasing = TRUE)
# Output recorded when this was originally run (names above their counts):
#   kean-s        beck-s        shackleton-s  shapiro-r     watson-k      blair-l
#   187           114           111           110           69            68
#   griffith-j    mcconnell-m   dasovich-j    steffes-j     campbell-l    kaminski-v
#   68            65            63            54            53            48
#   ward-k        arnold-j      haedicke-m    sanders-r     farmer-d      weldon-c
#   48            46            46            40            38            37
#   germany-c     ruscitti-k    mann-k        lavorato-j    taylor-m      white-s
#   35            35            34            33            33            31
#   cash-m        zipper-a      williams-w3   davis-d       rogers-b      lokay-m
#   29            26            24            23            23            22
#   scholtes-d    love-p        neal-s        scott-s       whalley-g
#   22            21            21            21            21

# These questions only deal with the number of folders and files, not the
# contents of the files, which is far more interesting.
#
# We want to look at the to, from, subject, date quadruple from each e-mail
# message, i.e. the content.
#
# How can we read this information: for each file/message, read all the lines
# up to the first blank line.
lines = readLines("skilling-j/inbox/1007.")

# The header is everything before the first empty line. If there is no empty
# line, treat the whole file as header; seq_len() also copes with the first
# line being empty (a 1:0 index would wrongly select line 1).
blankAt = which(lines == "")
headerEnd = if (length(blankAt) > 0) blankAt[1] - 1 else length(lines)
header = lines[seq_len(headerEnd)]

# Parse the "Field: value" header lines into a one-row character matrix whose
# column names are the field names. Close the connection once it is read.
con = textConnection(header)
h = read.dcf(con)
close(con)

h[, "From"]
h[, "To"]
# Column names come from the file verbatim and matrix indexing is
# case-sensitive, so the field is "Subject", not "subject".
h[, "Subject"]

# %z parses a signed numeric UTC offset such as -0700; %Z is output-only and
# cannot be used to parse input. tz = "UTC" makes the result independent of
# the machine's local time zone.
# NOTE(review): assumes Date values look like
# "Mon, 14 May 2001 16:39:00 -0700 (PDT)" - confirm against the corpus.
strptime(h[, "Date"], "%a, %d %b %Y %H:%M:%S %z", tz = "UTC")