def build_app()

in ai-ml/gke-ray/rayserve/llm/tpu/serve_tpu.py [0:0]


def build_app(cli_args: Dict[str, str]) -> serve.Application:
    """Builds the Serve app based on CLI arguments."""
    # Connect to the Ray cluster via the Ray Client endpoint.
    ray.init(ignore_reinit_error=True, address="ray://localhost:10001")

    model_id = os.environ["MODEL_ID"]

    # Reserve one CPU-only bundle for the deployment replica plus one
    # CPU + TPU bundle per TPU chip for the vLLM worker actors.
    num_tpu_chips = get_num_tpu_chips()
    pg_resources = [{"CPU": 1}]  # for the deployment replica
    for _ in range(num_tpu_chips):
        pg_resources.append({"CPU": 1, "TPU": 1})  # for the vLLM actors

    # Use the PACK strategy since the deployment may span more than one TPU node.
    return VLLMDeployment.options(
        placement_group_bundles=pg_resources,
        placement_group_strategy="PACK",
    ).bind(
        model_id,
        num_tpu_chips,
        get_max_model_len(),
        get_tokenizer_mode(),
        get_dtype(),
    )
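
An app builder like this is normally handed to Ray Serve rather than called directly, either through a Serve config import path (serve_tpu:build_app) or programmatically. The snippet below is a minimal sketch of the programmatic route; the MODEL_ID value and the app name are placeholder assumptions, not values taken from serve_tpu.py.

# Minimal sketch: build the app locally and deploy it onto the Ray cluster.
# The model id and app name below are placeholders, not part of serve_tpu.py.
import os

from ray import serve

from serve_tpu import build_app  # assumes serve_tpu.py is on PYTHONPATH

os.environ.setdefault("MODEL_ID", "meta-llama/Llama-2-7b-chat-hf")  # placeholder model

app = build_app({})             # cli_args is accepted but unused by this builder
serve.run(app, name="llm")      # deploys VLLMDeployment behind a Serve endpoint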