in pig/src/main/java/com/twitter/elephantbird/pig/load/LzoBaseRegexLoader.java [48:100]
/**
 * Reads lines from the underlying record reader until one matches the pattern
 * supplied by {@code getPattern()}, and returns that line's capture groups as a tuple.
 *
 * <p>The returned tuple has one field per capture group (group 0, the whole
 * match, is not included). A group for which the matcher reports {@code null}
 * is stored as the empty string rather than {@code null}. The
 * {@code MatchedRegexLines} / {@code UnmatchedRegexLines} counters are
 * incremented once for every line examined.
 *
 * @return a tuple holding the capture groups of the next matching line, or
 *         {@code null} if the reader is unset, the reader has no more records,
 *         or the reader yields a {@code null} value
 * @throws IOException if reading fails; an interruption while reading is
 *         reported as an {@code ExecException} with error code 6018
 */
public Tuple getNext() throws IOException {
  if (reader == null) {
    return null;
  }

  // One Matcher is created up front and re-targeted at each line via reset().
  Matcher matcher = getPattern().matcher("");

  // Read lines until a match is found; the record reader decides when the
  // input is exhausted.
  try {
    while (reader.nextKeyValue()) {
      Object lineObj = reader.getCurrentValue();
      if (lineObj == null) {
        break;
      }
      String line = lineObj.toString();
      matcher.reset(line);

      if (matcher.find()) {
        incrCounter(LzoBaseRegexLoaderCounters.MatchedRegexLines, 1L);
        int groupCount = matcher.groupCount();
        Tuple t = tupleFactory_.newTuple(groupCount);
        // Capture groups are 1-based; tuple fields are 0-based.
        for (int i = 1; i <= groupCount; i++) {
          String group = matcher.group(i);
          t.set(i - 1, group != null ? group : "");
        }
        return t;
      }

      incrCounter(LzoBaseRegexLoaderCounters.UnmatchedRegexLines, 1L);
      // Only build the message when debug logging is on, so unmatched lines
      // do not pay for string concatenation on the hot path.
      if (LOG.isDebugEnabled()) {
        LOG.debug("No match for line " + line);
      }
    }
  } catch (InterruptedException e) {
    // Restore the interrupt flag so callers further up can observe it.
    Thread.currentThread().interrupt();
    int errCode = 6018;
    String errMsg = "Error while reading input";
    throw new ExecException(errMsg, errCode,
        PigException.REMOTE_ENVIRONMENT, e);
  }
  return null;
}