# What is the distribution of numbers of emails per user?

# What is the distribution of emails per folder?

# Are the users organizing their email into folders?

# Are certain folders common to all users?

##########################################################################

# Inspect the downloaded Enron archive: size, permissions, timestamps, owner.
file.info("~/Downloads/enron_mail_030204.tar.gz") 
                                         size isdir mode               mtime
~/Downloads/enron_mail_030204.tar.gz 3.85e+08 FALSE  644 2007-07-31 07:56:03
                                                   ctime               atime
~/Downloads/enron_mail_030204.tar.gz 2007-07-31 07:56:03 2008-06-28 13:44:48
                                     uid gid  uname grname
~/Downloads/enron_mail_030204.tar.gz 501 501 duncan duncan


# Change R's view of the current working directory to the top directory
# of all the people's folders. This way we don't need to qualify the queries
# with an absolute fully-qualified directory name and we can use "."
# But we should abstract this.

setwd("~/Downloads/Enron/maildir/")

# The entries at the top of maildir are the "logins/people/users".
# Each element is named after itself so that sapply()/lapply() over `people`
# return results labelled by user.
people <- setNames(nm = list.files("."))


# What is the distribution of emails per user?

Within each user, we find all their folders and all their files.
Each file is a message. So count the number of files, not directories.

# List every message stored for one user.
#   person: path to the user's top-level mail directory.
# Returns the paths, relative to `person`, of all files found at any depth
# beneath it. Directories themselves are not listed, so the length of the
# result is the number of messages.
findMessages <- function(person) {
  list.files(path = person, recursive = TRUE)
}

   # Number of messages per person. vapply() pins the result to an integer
   # vector named by user, even when `people` is empty (sapply() would
   # return an empty list there, which sort() cannot handle).
numPerPerson = vapply(people, function(p) length(findMessages(p)), integer(1))
   # The 50 users with the most messages, largest first.
sort(numPerPerson, decreasing = TRUE)[1:50]


########################

# Emails per folder

# One table per person, counting the messages in each of that person's
# folders. dirname() drops the file name from each message path, leaving the
# folder path relative to the person's directory.
# lapply() always yields a list of tables named by person; sapply() would
# collapse the result into a vector or matrix whenever every person happened
# to have the same number of folders.
numPerFolder = lapply(people,
                      function(p)
                        table(dirname(findMessages(p))))

So how do we determine the distribution of emails per folder in a meaningful
way?

# Pool the per-folder message counts over all users and summarize them.
summary(unlist(numPerFolder))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    1.0     3.0    13.0   156.3    72.0 11900.0 

# Histogram of the pooled per-folder message counts.
hist(unlist(numPerFolder))  


# What is the distribution of emails per folder?

First off, we can look in each folder under each person.
How many directories do we see?
Are there any files, i.e. individual messages?

# file.info(x) alone describes only the person's own directory. To see what
# is inside it, describe each top-level entry instead; the isdir column then
# separates folders (TRUE) from loose message files (FALSE).
lapply(people, function(x) file.info(list.files(x, full.names = TRUE)))


#  Number of folders per person
# `people` is already named by user, so a single directory scan gives a
# named list holding each user's top-level folder names.
personFolderNames = lapply(people, list.files)
# Count once and reuse, instead of recomputing for every summary below.
numFolders = vapply(personFolderNames, length, integer(1))
hist(numFolders)  # why not kernel density!!!!

summary(numFolders)
# So look beyond the 3rd quartile, computed from the data rather than
# hard-coded as 20.
names(personFolderNames)[numFolders > quantile(numFolders, .75)]
 [1] "arnold-j"     "beck-s"       "blair-l"      "campbell-l"   "cash-m"      
 [6] "dasovich-j"   "davis-d"      "farmer-d"     "germany-c"    "griffith-j"  
[11] "haedicke-m"   "kaminski-v"   "kean-s"       "lavorato-j"   "lokay-m"     
[16] "love-p"       "mann-k"       "mcconnell-m"  "neal-s"       "rogers-b"    
[21] "ruscitti-k"   "sanders-r"    "scholtes-d"   "scott-s"      "shackleton-s"
[26] "shapiro-r"    "steffes-j"    "taylor-m"     "ward-k"       "watson-k"    
[31] "weldon-c"     "whalley-g"    "white-s"      "williams-w3"  "zipper-a"


# What are the common folders?

Let's get the names of all the folders for each person and put them together
to get a frequency table.  Since no person can have two folders with the same
name at the top level of their mail directory, each person contributes at most
once to a folder name's count, so this count is "across" people.


# The 20 folder names shared by the most users, most common first.
# `decreasing` is spelled out rather than relying on partial matching of `dec`.
sort(table(unlist(personFolderNames)), decreasing = TRUE)[1:20]

             inbox         sent_items      deleted_items      all_documents 
               137                136                135                110 
discussion_threads               sent        notes_inbox         _sent_mail 
                93                 89                 82                 78 
          calendar           contacts           personal              tasks 
                71                 46                 39                 35 
             to_do                eol                prc    private_folders 
                34                 11                 10                  9 
        california             canada                ees      presentations 
                 7                  7                  7                  7

# As a proportion of users: the fraction of users who have each folder.
# Computed once and reused for both views below.
tmp = sort(table(unlist(personFolderNames)), decreasing = TRUE)/length(people)
tmp[1:20]

# Perhaps cut off at 50% of users.
tmp[ tmp > .5]


# Are the users organizing their email into folders?

Perhaps a metric is how many folders they have

# Top-level folder count for every user, largest first. vapply() guarantees
# an integer vector even for an empty list, where sapply() would return a
# list that sort() rejects.
sort(vapply(personFolderNames, length, integer(1)), decreasing = TRUE)
         kean-s          beck-s    shackleton-s       shapiro-r        watson-k 
            187             114             111             110              69 
        blair-l      griffith-j     mcconnell-m      dasovich-j       steffes-j 
             68              68              65              63              54 
     campbell-l      kaminski-v          ward-k        arnold-j      haedicke-m 
             53              48              48              46              46 
      sanders-r        farmer-d        weldon-c       germany-c      ruscitti-k 
             40              38              37              35              35 
         mann-k      lavorato-j        taylor-m         white-s          cash-m 
             34              33              33              31              29 
       zipper-a     williams-w3         davis-d        rogers-b         lokay-m 
             26              24              23              23              22 
     scholtes-d          love-p          neal-s         scott-s       whalley-g 
             22              21              21              21              21 
     hayslett-r        richey-c         symes-k        ybarbo-p        storey-g 
             20              20              20              20              19 
        wolfe-j         mckay-j    mclaughlin-e         staab-t        badeer-r 
             19              18              18              17              16 
     baughman-d       shively-h           buy-r      geaccone-t           lay-k 
             16              16              15              15              15 
      quigley-d          bass-e     hernandez-j         hyatt-k       stclair-c 
             15              14              14              14              14 
        sturm-f       brawner-s          ring-r         sager-e     semperger-c 
             14              13              13              13              13 
        smith-m        carson-m        corman-s        cuilla-m       derrick-j 
             13              12              12              12              12 
      fischer-m  gilbertsmith-d          king-j         nemec-g   schoolcraft-d 
             12              12              12              12              12 
     shankman-j         arora-h          dean-c       dorland-c         giron-d 
             12              11              11              11              11 
  perlingiere-d     schwieger-j      skilling-j         allen-p         hodge-j 
             11              11              11              10              10 
        jones-t        keavey-p         lewis-a         maggi-m        martin-t 
             10              10              10              10              10 
       presto-k      williams-j        benson-r           gay-r       grigsby-m 
             10              10               9               9               9 
  hendrickson-s    kuykendall-t           may-l         tholt-j      townsend-j 
              9               9               9               9               9 
     tycholiz-b       whalley-l      delainey-d       donohoe-t         ermis-f 
              9               9               8               8               8 
       horton-s       kitchen-l         mckay-b mims-thurston-p       pereira-s 
              8               8               8               8               8 
      pimenov-v        quenet-j          ring-a      rodrique-r         saibi-e 
              8               8               8               8               8 
       fossum-d       lenhart-m         lokey-t          rapp-b       sanchez-m 
              7               7               7               7               7 
        south-s         heard-m          hyvl-d       mccarty-d       platter-p 
              7               6               6               6               6 
      solberg-g        thomas-p      zufferli-j        bailey-s      crandell-s 
              6               6               6               5               5 
       donoho-l        forney-j          hain-m        keiser-k         lucci-p 
              5               5               5               5               5 
        panus-s     salisbury-h  stepenovitch-j      swerzbin-m     causholli-m 
              5               5               5               5               4 
         gang-l        guzman-m        linder-e       merriss-s         parks-j 
              4               4               4               4               4 
       phanis-s     reitmeyer-j       dickson-s         holst-k        meyers-a 
              4               4               3               3               3 
       motley-m       slinger-r         whitt-m        harris-s       stokley-c 
              3               3               3               2               1 


Which are the ones beyond the 75th percentile (the third quartile)?
# Top-level folder count per user, as a type-stable named integer vector.
numFolders = vapply(personFolderNames, length, integer(1))

# Users whose folder count is strictly above the third quartile, largest first.
sort(numFolders[numFolders > quantile(numFolders, .75)], decreasing = TRUE)
      kean-s       beck-s shackleton-s    shapiro-r     watson-k      blair-l 
         187          114          111          110           69           68 
  griffith-j  mcconnell-m   dasovich-j    steffes-j   campbell-l   kaminski-v 
          68           65           63           54           53           48 
      ward-k     arnold-j   haedicke-m    sanders-r     farmer-d     weldon-c 
          48           46           46           40           38           37 
   germany-c   ruscitti-k       mann-k   lavorato-j     taylor-m      white-s 
          35           35           34           33           33           31 
      cash-m     zipper-a  williams-w3      davis-d     rogers-b      lokay-m 
          29           26           24           23           23           22 
  scholtes-d       love-p       neal-s      scott-s    whalley-g 
          22           21           21           21           21 


These questions only deal with the number of folders and files, not the contents
of the files, which are far more interesting.  We want to look at the
  to, from, subject, date
quadruple from each e-mail message, i.e. the content.

How can we read this information:

  for each file/message, read all the lines up to the first blank

lines = readLines("skilling-j/inbox/1007.")
# The header is everything before the first blank line. match() returns the
# position of that line, or one past the end when there is none, so a message
# without a body keeps all of its lines instead of failing on min(integer(0)).
header = lines[seq_len(match("", lines, nomatch = length(lines) + 1L) - 1L)]

# Parse the "Field: value" header lines into a one-row character matrix.
# The connection is closed explicitly because read.dcf() does not close a
# connection it was handed.
con = textConnection(header)
h = read.dcf(con)
close(con)
h[, "From"]
h[, "To"]
# Column names keep the case used in the file, so the field is "Subject".
h[, "Subject"]
# %z reads the signed numeric UTC offset (e.g. -0700); %Z is documented as
# output-only, and a literal "-" in the format could never match a "+" offset.
# NOTE(review): assumes any text after the offset, such as "(PDT)", is
# ignored by strptime() -- confirm against a sample message.
strptime(h[,"Date"], "%a, %d %b %Y %H:%M:%S %z")