in ai-ml/gke-ray/rayserve/llm/tpu/serve_tpu.py [0:0]
def build_app(cli_args: Dict[str, str]) -> serve.Application:
    """Assemble the Ray Serve application for the vLLM deployment.

    Connects to the Ray cluster at ``ray://localhost:10001`` (an already
    initialised connection is tolerated), reads the model identifier from
    the ``MODEL_ID`` environment variable, and reserves a placement group
    made of one CPU-only bundle plus one CPU+TPU bundle per TPU chip.

    Args:
        cli_args: Command-line arguments supplied by the Serve CLI. They
            are accepted for interface compatibility; this function does
            not read them.

    Returns:
        The bound ``VLLMDeployment`` application.

    Raises:
        KeyError: If the ``MODEL_ID`` environment variable is not set.
    """
    ray.init(ignore_reinit_error=True, address="ray://localhost:10001")

    model_id = os.environ['MODEL_ID']
    num_tpu_chips = get_num_tpu_chips()

    # Bundle 0 hosts the deployment replica itself; every following bundle
    # backs one vLLM actor and pins a single TPU chip to it.
    replica_bundle = {"CPU": 1}
    actor_bundles = [{"CPU": 1, "TPU": 1} for _ in range(num_tpu_chips)]
    pg_resources = [replica_bundle, *actor_bundles]

    # Use PACK strategy since the deployment may use more than one TPU node.
    deployment = VLLMDeployment.options(
        placement_group_bundles=pg_resources,
        placement_group_strategy="PACK",
    )
    return deployment.bind(
        model_id,
        num_tpu_chips,
        get_max_model_len(),
        get_tokenizer_mode(),
        get_dtype(),
    )