# Reading a "longitudinal" file in which every observational unit (person)
# occupies three lines:
#   1. a comma-separated line of personal details,
#   2. the values of the first variable (whitespace separated, "." = missing),
#   3. the values of the second variable (same format).

fileName <- "~/Projects/ComputingCurriculum/Workshop2/IO/Longitudinal/longitudinal"

# --- A first look: read just the first record ------------------------------
con <- file(fileName, "r")
# Read the header (personal details) of the first record.
general <- read.table(con, sep = ",", nrows = 1)
general
# The next two lines of the open connection are that record's value lines.
values <- read.table(con, na.strings = ".", nrows = 2)
t(values)
close(con)

# --- Record-by-record loop -------------------------------------------------
# So we can loop over the records and extract this information one
# observational unit at a time.
# The per-record pieces are collected in lists and combined once at the end,
# which avoids the repeated copying caused by growing vectors with c().
v1_parts <- list()
v2_parts <- list()
recordNum <- 1
con <- file(fileName, "r")
repeat {
  # Personal-details line; zero lines (end of file) or a blank trailing line
  # means there are no more records.
  header <- readLines(con, n = 1)
  if (length(header) == 0L || !nzchar(trimws(header))) {
    break
  }
  # `what` is a type given by example: numeric() reads numbers, whereas the
  # string "numeric" would make scan() return character data.
  v1_parts[[recordNum]] <- scan(con, nlines = 1, what = numeric(),
                                na.strings = ".")
  v2_parts[[recordNum]] <- scan(con, nlines = 1, what = numeric(),
                                na.strings = ".")
  recordNum <- recordNum + 1
}
close(con)

V1 <- as.numeric(unlist(v1_parts))
V2 <- as.numeric(unlist(v2_parts))
# Record number of each observation, based on the length of the first
# value line of every record.
id <- rep(seq_along(v1_parts), lengths(v1_parts))

################################################################################
# An alternative approach is to read the entire file contents
# into memory, and then extract the personal details
# and then the values by dealing with the lines for all people
# as entire blocks, i.e. not person by person.
# This almost works!

# No connection is needed since reading all the content is one atomic act.
lines <- readLines(fileName)
# Discard empty lines (e.g. a blank last line).
lines <- lines[nzchar(lines)]

# Every record must contribute exactly three lines.
stopifnot(length(lines) %% 3L == 0L)
numPeople <- length(lines) / 3

# We know each record has 3 lines: personal details and 2 value lines
# So let's get all the personal details first
# These are in lines 1, 4, 7, 10, ...
# Indices of the personal-details lines: 1, 4, 7, 10, ...
# (`lines` holds the non-empty lines of the file and `numPeople` the number
# of three-line records, both defined earlier in the script.)
i <- seq(1, length.out = numPeople, by = 3)

tmp <- textConnection(lines[i])
personal.details <- read.table(tmp, sep = ",", strip.white = TRUE)
close(tmp)

# Now for the values
# which are in the other lines, the complement of those indexed by i
tmp <- textConnection(lines[-i])
values <- read.table(tmp, na.strings = ".", fill = TRUE)
close(tmp)

# Rows 1, 3, 5, ... of `values` hold the first variable, rows 2, 4, 6, ...
# the second.
j <- seq(1, by = 2, length.out = numPeople)
# unlist() on a data frame runs column by column, so the person index cycles
# 1..numPeople once per column; hence rep() with times = ncol(values).
vals <- data.frame(
  V1 = unlist(values[j, ]),
  V2 = unlist(values[j + 1, ]),
  id = factor(rep(seq_len(numPeople), ncol(values)),
              labels = as.character(personal.details[, 3]))
)
vals <- vals[order(vals$id), ]

# gl - generate factor levels
#   gl(numPeople, 5, labels = as.character(personal.details[, 3]))
# Same as
#   rep(1:numPeople, rep(ncol(values), numPeople))

# Problem is that we have too many
# observations for the second record
# (fill = TRUE pads the shorter rows with NA).
# The last set of missing values is too many.
# So we can try to fix this or use a different approach.
#
# Each variable separate.
# lines[i + 1] are all first-variable lines, lines[i + 2] all second-variable
# lines; scan() reads each set into one long numeric vector.
a <- as.data.frame(
  lapply(c(V1 = 1, V2 = 2), function(which) {
    tc <- textConnection(lines[i + which])
    on.exit(close(tc))
    scan(tc, na.strings = ".")
  })
)

# But now we need to know how to match observations
# to people. So we need to know the number of observations
# per person.
numObs <- vapply(lines[i + 1], function(x) {
  tc <- textConnection(x)
  # Close the connection explicitly so that it is not leaked.
  on.exit(close(tc))
  length(scan(tc, na.strings = "."))
}, integer(1))
a$id <- factor(rep(seq_len(numPeople), numObs),
               labels = as.character(personal.details[, 3]))

# Alternatively, we could use count.fields, but that doesn't
# have a na.strings parameter.
# Kept disabled on purpose: the call below stops with an "unused argument"
# error.
if (FALSE) {
  tmp <- textConnection(lines[i + 1])
  # XXX Fails!
  num <- count.fields(tmp, na.strings = ".")
  close(tmp)
}

###
# Read all the lines, and then just work on the values
# by themselves.

# Process all the value lines separately
# and keep them separate so that we know how many observations
# we have for each individual.
tmp <- lapply(lines[-i], function(x) {
  tc <- textConnection(x)
  on.exit(close(tc))
  scan(tc, na.strings = ".")
})

# 1, 3, 5, ...
# `tmp` is the list of per-line numeric vectors built above: elements
# 1, 3, 5, ... are first-variable lines and 2, 4, 6, ... second-variable lines.
j <- seq(1, by = 2, length.out = numPeople)

# Number of observations per person, taken from the first-variable lines.
numObsPerPerson <- vapply(tmp[j], length, integer(1))

# Stack the per-person vectors and label each observation with the person's
# identifier (third column of the personal details).
vals <- data.frame(
  V1 = unlist(tmp[j]),
  V2 = unlist(tmp[j + 1]),
  id = factor(rep(seq_len(numPeople), numObsPerPerson),
              labels = as.character(personal.details[, 3]))
)

#############################################