in rcfile/src/main/java/com/twitter/elephantbird/util/RCFileUtil.java [82:144]
/**
 * Works out which stored columns have to be read to satisfy the required fields.
 *
 * <p>The required field ids are picked from {@code currFieldIds}: every entry when
 * {@code RCFileUtil.REQUIRED_FIELD_INDICES_CONF} is unset or empty in {@code conf},
 * otherwise only the entries at the comma-separated indices given by that setting
 * (whitespace around each index is ignored).
 *
 * <p>A stored column is selected when its field id is positive and is one of the
 * required ids. If any required id does not appear among the stored field ids, the
 * last stored column is selected as well; that column must carry field id {@code -1}
 * (the "unknowns" column, going by the messages emitted here).
 *
 * @param conf configuration consulted for the required field indices
 * @param currFieldIds field ids currently known to the reader; the configured
 *     indices refer to positions in this list
 * @param storedInfo metadata whose {@code getFieldIdList()} gives the field id of
 *     each stored column, in column order
 * @return positions (indices into the stored field id list) of the columns to read
 * @throws IOException if a configured index is outside {@code currFieldIds}, or if
 *     an unknowns column is needed but the stored field id list is empty or its last
 *     entry is not {@code -1}
 * @throws NumberFormatException if a configured index is not a valid integer
 */
public static ArrayList<Integer> findColumnsToRead(
    Configuration conf,
    List<Integer> currFieldIds,
    ColumnarMetadata storedInfo)
    throws IOException {

  ArrayList<Integer> columnsToRead = Lists.newArrayList();

  // first find the required fields
  ArrayList<Integer> requiredFieldIds = Lists.newArrayList();
  String reqFieldStr = conf.get(RCFileUtil.REQUIRED_FIELD_INDICES_CONF, "");
  int numKnownFields = currFieldIds.size();

  if (reqFieldStr == null || reqFieldStr.equals("")) {
    // no explicit projection configured : every known field is required
    requiredFieldIds.addAll(currFieldIds);
  } else {
    for (String str : reqFieldStr.split(",")) {
      // trim so that values such as "0, 2" are accepted
      int idx = Integer.parseInt(str.trim());
      if (idx < 0 || idx >= numKnownFields) {
        throw new IOException("idx " + idx + " is out of range for known fields");
      }
      requiredFieldIds.add(currFieldIds.get(idx));
    }
  }

  // stored columns whose (positive) field id is required
  List<Integer> storedFieldIds = storedInfo.getFieldIdList();
  for (int i = 0; i < storedFieldIds.size(); i++) {
    int sid = storedFieldIds.get(i);
    if (sid > 0 && requiredFieldIds.contains(sid)) {
      columnsToRead.add(i);
    }
  }

  // unknown fields : the required fields that are not listed in storedFieldIds
  StringBuilder unknownFields = new StringBuilder();
  for (int rid : requiredFieldIds) {
    if (!storedFieldIds.contains(rid)) {
      unknownFields.append(' ').append(rid);
    }
  }

  boolean needUnknownsColumn = unknownFields.length() > 0;
  if (needUnknownsColumn) {
    int last = storedFieldIds.size() - 1;
    LOG.info("unknown fields among required fileds :" + unknownFields);
    // last < 0 means nothing is stored at all; report it through the declared
    // IOException rather than letting get(-1) fail with an unchecked exception
    if (last < 0 || storedFieldIds.get(last) != -1) { // not expected
      throw new IOException("No unknowns column in in input");
    }
    columnsToRead.add(last);
  }

  LOG.info(String.format(
      "reading %d%s out of %d stored columns for %d required columns",
      columnsToRead.size(),
      (needUnknownsColumn ? " (including unknowns column)" : ""),
      storedFieldIds.size(),
      requiredFieldIds.size()));

  return columnsToRead;
}