in ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py [0:0]
def build_app(cli_args: Dict[str, str]) -> serve.Application:
    """Builds the Serve app based on CLI arguments."""
    ray.init(ignore_reinit_error=True)

    num_tpu_chips = get_num_tpu_chips()
    tpu_head = get_tpu_head()
    tpu_slices = 1
    if tpu_head is not None:
        tpu_slices = int(ray.cluster_resources()[tpu_head])
    num_tpu_chips_per_slice = int(num_tpu_chips / tpu_slices)

    # Construct a placement group for 1 TPU slice. Each model should run on its own slice.
    pg_resources = []
    pg_resources.append({"CPU": 1})  # for the deployment replica
    for _ in range(num_tpu_chips_per_slice):
        pg_resources.append({"CPU": 1, "TPU": 1})  # for the vLLM actors
    if tpu_head is not None:
        # Reserve the TPU head resource so Ray workers are not placed across slices.
        pg_resources.append({tpu_head: 1})
    return MultiModelDeployment.bind(
        VLLMDeployment.options(
            placement_group_bundles=pg_resources,
            placement_group_strategy="PACK",
        ).bind(
            model=os.environ['ASSIST_MODEL_ID'],
            tensor_parallel_size=num_tpu_chips_per_slice,
            enforce_eager=True,
        ),
        VLLMSummarizerDeployment.options(
            placement_group_bundles=pg_resources,
            placement_group_strategy="PACK",
        ).bind(
            model=os.environ['SUMMARIZER_MODEL_ID'],
            tensor_parallel_size=num_tpu_chips_per_slice,
            enforce_eager=True,
        ),
    )
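

# Note: get_num_tpu_chips() and get_tpu_head() are defined elsewhere in this
# file and are not part of this excerpt. The sketch below is an assumption of
# what they return, inferred only from how build_app() uses them: the total
# "TPU" chip count registered with Ray, and the name of the per-slice head
# resource that Ray advertises for multi-host TPU slices. The functions carry
# a "_sketch_" prefix so they do not shadow the real helpers; treat them as
# illustrative, not authoritative.
from typing import Optional


def _sketch_get_num_tpu_chips() -> int:
    # Total TPU chips visible to the Ray cluster; 0 on clusters with no TPUs.
    return int(ray.cluster_resources().get("TPU", 0))


def _sketch_get_tpu_head() -> Optional[str]:
    # First resource that looks like a TPU slice head (assumed naming such as
    # "TPU-v4-16-head"); None when no multi-host TPU slice is registered.
    for resource in ray.cluster_resources():
        if resource.startswith("TPU-") and resource.endswith("-head"):
            return resource
    return None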
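

# Usage sketch (assumption, not shown in this excerpt): for a quick local test
# the builder can be deployed directly with Ray Serve. The model IDs are read
# from environment variables, so ASSIST_MODEL_ID and SUMMARIZER_MODEL_ID must
# be set before build_app() is called. In the GKE deployment this app is
# normally referenced from a Ray Serve / RayService config by import path
# rather than run this way.
if __name__ == "__main__":
    # Passing an empty dict because build_app() above takes its configuration
    # from the environment rather than from cli_args.
    serve.run(build_app({}))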