public GraphAndGroundTruth generateWithDisjointClusters()

in src/main/java/com/twitter/sbf/generator/GraphGenerator.java [358:492]


  public GraphAndGroundTruth generateWithDisjointClusters() {
    int[] firstIdsInEachCluster = new int[numClusters];
    int[] lastIdsInEachCluster = new int[numClusters];
    IntSet[] groundTruthClusters = new IntSet[numClusters];

    Random r = this.rng;
    int numNodes = 0;
    for (int i = 0; i < numClusters; i++) {
      int sampledClusterSize = minClusterSize + r.nextInt(maxClusterSize - minClusterSize);
      firstIdsInEachCluster[i] = numNodes;
      lastIdsInEachCluster[i] = numNodes + sampledClusterSize - 1;
      numNodes += sampledClusterSize;
    }

    int[] vertexToClusterId = new int[numNodes];
    groundTruthClusters[0] = new IntOpenHashSet();
    for (int i = 0, clusterId = 0; i < numNodes; i++) {
      if (i > lastIdsInEachCluster[clusterId]) {
        clusterId++;
        groundTruthClusters[clusterId] = new IntOpenHashSet();
      }
      vertexToClusterId[i] = clusterId;
      groundTruthClusters[clusterId].add(i);
      if (i == numNodes - 1) {
        assert clusterId == numClusters - 1;
      }
    }

    ArrayList<HashSet<Integer>> adjLists = new ArrayList<>(numNodes);
    for (int i = 0; i < numNodes; i++) {
      adjLists.add(new HashSet<>());
    }

    int numEdges = 0;
    int numIntraClusterEdges = 0;
    for (int clusterId = 0; clusterId < numClusters; clusterId++) {
      int sizeOfThisCluster =
          lastIdsInEachCluster[clusterId] - firstIdsInEachCluster[clusterId] + 1;
      double probForThisCluster = getProbabilityInsideCluster(sizeOfThisCluster);
      assert probForThisCluster > 0 && probForThisCluster < 1;
      GeometricDistribution gd = new GeometricDistribution(rng, probForThisCluster);

      int numEdgesInsideThisCluster = 0;
      for (int nodeId = firstIdsInEachCluster[clusterId];
           nodeId <= lastIdsInEachCluster[clusterId]; nodeId++) {
        // Starting from nodeId after current node, repeat
        for (int neighborNodePosition = nodeId + 1;
             neighborNodePosition <= lastIdsInEachCluster[clusterId];) {
          int numNodesToSkip = gd.sample();
          neighborNodePosition += numNodesToSkip;
          if (neighborNodePosition <= lastIdsInEachCluster[clusterId]) {
            assert !adjLists.get(nodeId).contains(neighborNodePosition);
            assert !adjLists.get(neighborNodePosition).contains(nodeId);
            adjLists.get(nodeId).add(neighborNodePosition);
            adjLists.get(neighborNodePosition).add(nodeId);
            neighborNodePosition++;
            numEdges++;
            numEdgesInsideThisCluster++;
            numIntraClusterEdges++;
          }
        }
      }

      float actualProbInsideCluster =
          numEdgesInsideThisCluster * 1.0f / ((sizeOfThisCluster * (sizeOfThisCluster - 1)) / 2);
      assert Math.abs(actualProbInsideCluster - probForThisCluster) < 0.1
          : String.format("cluster %d: size %d, assigned prob. %f, actual prob. %f\n",
            clusterId + 1, sizeOfThisCluster, probForThisCluster, actualProbInsideCluster);
    }

    int numGlobalEdges = 0;
    GeometricDistribution gd = new GeometricDistribution(rng, getGlobalProbability());
    for (int nodeId = 0; nodeId < numNodes; nodeId++) {
      for (int neighborNodePosition = nodeId + 1; neighborNodePosition < numNodes;) {
        int numNodesToSkip = gd.sample();
        neighborNodePosition += numNodesToSkip;
        if (neighborNodePosition < numNodes) {
          if (!adjLists.get(nodeId).contains(neighborNodePosition)) {
            adjLists.get(nodeId).add(neighborNodePosition);
            adjLists.get(neighborNodePosition).add(nodeId);
            neighborNodePosition++;
            numEdges++;
            numGlobalEdges++;
          }
        }
      }
    }

    int[][] nbrs = new int[numNodes][];
    int numEdges2 = 0;
    for (int i = 0; i < numNodes; i++) {
      nbrs[i] = new int[adjLists.get(i).size()];
      int j = 0;
      HashSet<Integer> s = adjLists.get(i);
      for (int neighbor : s) {
        nbrs[i][j] = neighbor;
        j++;
        numEdges2++;
      }
      Arrays.sort(nbrs[i]);
    }

    float[][] wts = null;
    if (this.isWeighted) {
      wts = new float[numNodes][];
      for (int i = 0; i < numNodes; i++) {
        wts[i] = new float[nbrs[i].length];
        for (int j = 0; j < wts[i].length; j++) {
          // Consider only cells in the lower-triangular part of the adjacency matrix, and then
          // replicate the weight to the corresponding cell in the upper-triangular part.
          if (nbrs[i][j] < i) {
            // for inter-cluster edges, set their weight to be low
            float newWt = 0;
            if (vertexToClusterId[i] != vertexToClusterId[nbrs[i][j]]) {
              newWt = (float) (lowerWeightMode - 1.5 * lowerWeightDist.getStandardDeviation());
            } else {
              newWt = (float) sampleEdgeWeight();
            }
            wts[i][j] = newWt;
            int indexOfIInJ = Arrays.binarySearch(nbrs[nbrs[i][j]], i);
            wts[nbrs[i][j]][indexOfIInJ] = newWt;
          }
        }
      }
    }

    System.err.format("Num intra-cluster edges: %d, num global edges: %d, "
            + "fraction global edges: %f\n",
        numIntraClusterEdges, numGlobalEdges, numGlobalEdges * 1.0f / numEdges);

    assert numEdges * 2 == numEdges2
        : String.format("numEdges %d, numEdges2 %d", numEdges, numEdges2);

    return new GraphAndGroundTruth(new Graph(numNodes, numEdges, nbrs, wts), groundTruthClusters);
  }