in sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsUtil.java [143:206]
public List<GcsPath> expand(GcsPath gcsPattern) throws IOException {
  // Expands a GCS path that may contain glob wildcards into the list of matching objects.
  //
  // - The object part of gcsPattern must pass isGcsPatternSupported(); otherwise
  //   Preconditions.checkArgument rejects it.
  // - If the object part does not match GLOB_PREFIX it is treated as a plain path and returned
  //   as-is in a single-element list, without contacting GCS.
  // - Otherwise, all objects in the bucket that start with the literal prefix preceding the
  //   first wildcard are listed page by page, and those whose full name matches the glob
  //   (and do not end in "/") are returned.
  //
  // Throws IOException if a list request still fails after the retries performed by
  // ResilientOperation, or if the calling thread is interrupted while retrying (in which case
  // the thread's interrupt flag is restored before throwing).
  Preconditions.checkArgument(isGcsPatternSupported(gcsPattern.getObject()));
  Matcher m = GLOB_PREFIX.matcher(gcsPattern.getObject());
  if (!m.matches()) {
    // Not a glob.
    // Results of the GCS storage list feature are only eventually consistent, so we should not
    // use that feature to check the existence of single files.
    return ImmutableList.of(gcsPattern);
  }

  // Part before the first wildcard character.
  String prefix = m.group("PREFIX");
  Pattern p = Pattern.compile(globToRegexp(gcsPattern.getObject()));
  LOG.debug("matching files in bucket {}, prefix {} against pattern {}", gcsPattern.getBucket(),
      prefix, p);

  // List all objects that start with the prefix (including objects in sub-directories).
  Storage.Objects.List listObject = storageClient.objects().list(gcsPattern.getBucket());
  listObject.setMaxResults(MAX_LIST_ITEMS_PER_CALL);
  listObject.setPrefix(prefix);

  String pageToken = null;
  List<GcsPath> results = new LinkedList<>();
  do {
    if (pageToken != null) {
      listObject.setPageToken(pageToken);
    }

    Objects objects;
    try {
      // Retry the list call on socket errors, using a bounded exponential backoff.
      objects = ResilientOperation.retry(
          ResilientOperation.getGoogleRequestCallable(listObject),
          new AttemptBoundedExponentialBackOff(3, 200),
          RetryDeterminer.SOCKET_ERRORS,
          IOException.class);
    } catch (Exception e) {
      if (e instanceof InterruptedException) {
        // Wrapping the exception would otherwise hide the interruption from callers;
        // restore the flag so code further up the stack can still observe it.
        Thread.currentThread().interrupt();
      }
      throw new IOException("Unable to match files in bucket " + gcsPattern.getBucket()
          + ", prefix " + prefix + " against pattern " + p.toString(), e);
    }

    Preconditions.checkNotNull(objects);
    if (objects.getItems() == null) {
      // The response carried no items; stop paging.
      break;
    }

    // Filter objects based on the regex.
    for (StorageObject o : objects.getItems()) {
      String name = o.getName();
      // Skip directories, which end with a slash.
      if (p.matcher(name).matches() && !name.endsWith("/")) {
        LOG.debug("Matched object: {}", name);
        results.add(GcsPath.fromObject(o));
      }
    }

    pageToken = objects.getNextPageToken();
  } while (pageToken != null);

  return results;
}