def build_app()

in ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py [0:0]
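
The function below depends on two helpers defined elsewhere in serve_tpu.py, get_num_tpu_chips() and get_tpu_head(), whose bodies are not shown in this excerpt. As a rough sketch of what they are assumed to do: both read Ray cluster resources, where the "TPU" resource counts chips and a "TPU-...-head" resource marks the head of each TPU slice, following Ray's TPU resource conventions. The implementation here is an assumption, not the file's actual code.

from typing import Optional

import ray


# Sketch only (assumed behavior, not the file's actual implementation).
def get_num_tpu_chips() -> int:
    """Total TPU chips registered with the Ray cluster."""
    return int(ray.cluster_resources().get("TPU", 0))


def get_tpu_head() -> Optional[str]:
    """Per-slice TPU head resource name (e.g. 'TPU-v5litepod-8-head'), if present."""
    for resource in ray.cluster_resources():
        if resource.startswith("TPU-") and resource.endswith("-head"):
            return resource
    return None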


def build_app(cli_args: Dict[str, str]) -> serve.Application:
    """Builds the Serve app based on CLI arguments."""
    ray.init(ignore_reinit_error=True)

    num_tpu_chips = get_num_tpu_chips()
    tpu_head = get_tpu_head()
    tpu_slices = 1
    if tpu_head is not None:
        # The number of TPU slices equals the count of TPU head resources in the cluster.
        tpu_slices = int(ray.cluster_resources()[tpu_head])
    num_tpu_chips_per_slice = int(num_tpu_chips / tpu_slices)
    # Construct a placement group for 1 TPU slice. Each model should run on its own slice.
    pg_resources = []
    pg_resources.append({"CPU": 1})  # for the deployment replica
    for _ in range(num_tpu_chips_per_slice):
        pg_resources.append({"CPU": 1, "TPU": 1})  # for the vLLM actors
    # Add the TPU head resource to the placement group to ensure Ray workers are not placed across slices.
    if tpu_head is not None:
        pg_resources.append({tpu_head: 1})

    return MultiModelDeployment.bind(
        VLLMDeployment.options(
            placement_group_bundles=pg_resources,
            placement_group_strategy="PACK").bind(
            model=os.environ['ASSIST_MODEL_ID'],
            tensor_parallel_size=num_tpu_chips_per_slice,
            enforce_eager=True,
        ),
        VLLMSummarizerDeployment.options(
            placement_group_bundles=pg_resources,
            placement_group_strategy="PACK").bind(
            model=os.environ['SUMMARIZER_MODEL_ID'],
            tensor_parallel_size=num_tpu_chips_per_slice,
            enforce_eager=True,
        ),
    )
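
A minimal deployment sketch, assuming the assist and summarizer model IDs are supplied through the ASSIST_MODEL_ID and SUMMARIZER_MODEL_ID environment variables that build_app reads; the placeholder values below are illustrative, not taken from the source. Because build_app matches Ray Serve's app-builder signature (a single dict of CLI arguments), the same app could alternatively be started with `serve run serve_tpu:build_app`.

import os

from ray import serve

if __name__ == "__main__":
    # Placeholder model IDs for illustration only.
    os.environ.setdefault("ASSIST_MODEL_ID", "<assist-model-id>")
    os.environ.setdefault("SUMMARIZER_MODEL_ID", "<summarizer-model-id>")

    # Build the composed app and deploy it on the local Ray cluster.
    serve.run(build_app(cli_args={}))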