dazzling-rocket-38979
# 09/13/2024, 5:05 AM
# (FIX: chat timestamp was fused onto the `resource` line, which makes the
# HCL unparseable; moved it into this comment.)
#
# Service account attached to the GKE cluster's nodes. The workspace name is
# embedded so parallel Terraform workspaces do not collide on account_id.
resource "google_service_account" "metaflow_kubernetes_control_plane_service_account" {
  provider = google-beta

  # TODO fix names (e.g. gsa would be nice)
  # gsa-metaflow-k8s-ctrl-<workspace>
  account_id   = "sa-mf-k8s-${terraform.workspace}"
  display_name = "Service Account for Kubernetes Control Plane (${terraform.workspace})"
}
# GKE cluster for Metaflow workloads. Workload Identity is enabled so pods can
# impersonate GCP service accounts without node-level credentials.
resource "google_container_cluster" "metaflow_kubernetes" {
  provider           = google-beta
  name               = var.kubernetes_cluster_name
  initial_node_count = 1
  location           = var.zone

  workload_identity_config {
    workload_pool = "${var.project}.svc.id.goog"
  }

  node_config {
    # Google recommends custom service accounts that have cloud-platform scope
    # and permissions granted via IAM Roles.
    service_account = google_service_account.metaflow_kubernetes_control_plane_service_account.email

    # FIX: the scope URL was wrapped in <...> (chat-client autolink mangling).
    # "<https://...>" is not a valid OAuth scope; nodes created with it cannot
    # obtain credentials for Google APIs, including registry image pulls.
    oauth_scopes = [
      "https://www.googleapis.com/auth/cloud-platform"
    ]

    workload_metadata_config {
      mode = "GKE_METADATA"
    }
  }

  # Node auto-provisioning limits (cluster-wide totals, not per node).
  # NOTE(review): auto-provisioned pools get their own default config — they do
  # NOT inherit node_config above; verify their service account/scopes too.
  cluster_autoscaling {
    enabled = true
    resource_limits {
      resource_type = "cpu"
      minimum       = 1
      maximum       = 200
    }
    resource_limits {
      resource_type = "memory"
      minimum       = 2
      maximum       = 400
    }
  }

  network         = google_compute_network.metaflow_compute_network.name
  subnetwork      = google_compute_subnetwork.metaflow_subnet_for_kubernetes.name
  networking_mode = "VPC_NATIVE"

  # empty block is required for VPC-native clusters
  ip_allocation_policy {}
}
# GPU node pool for GPU-based workloads
resource "google_container_node_pool" "gpu_pool" {
  provider           = google-beta
  cluster            = google_container_cluster.metaflow_kubernetes.name
  location           = var.zone
  name               = "gpu-node-pool"
  initial_node_count = 1

  node_config {
    machine_type    = "n1-standard-8" # adjust machine type based on GPU requirements
    disk_size_gb    = 600
    service_account = google_service_account.metaflow_kubernetes_control_plane_service_account.email

    # FIX: scope URL was wrapped in <...> (chat-client autolink mangling).
    # An invalid scope leaves the kubelet unable to authenticate to Google
    # APIs, so Artifact Registry pulls fail on exactly this pool with
    # ImagePullBackOff — fixing the scope is the likely remedy.
    oauth_scopes = [
      "https://www.googleapis.com/auth/cloud-platform"
    ]

    workload_metadata_config {
      mode = "GKE_METADATA"
    }

    metadata = {
      "disable-legacy-endpoints" = "true"
    }

    guest_accelerator {
      type  = "nvidia-tesla-t4" # Specify the type of GPU (e.g., nvidia-tesla-v100)
      count = 1                 # Number of GPUs per node, adjust as needed
    }
  }

  autoscaling {
    min_node_count = 1
    max_node_count = 10
  }

  management {
    auto_upgrade = true
    auto_repair  = true
  }

  upgrade_settings {
    max_surge       = 1
    max_unavailable = 0
  }
}
But for some reason, when I use this gpu_pool,
it cannot pull the target image that I am trying to use.
from metaflow import FlowSpec, step, kubernetes
# Define a Metaflow flow
# Image run on Kubernetes for the @kubernetes step; pulled from Artifact
# Registry (us-west1). The node pool's service account needs read access
# to this registry, and the node's OAuth scopes must be valid.
target_image = 'us-west1-docker.pkg.dev/project/image-name/image-name:latest'


class TestRun(FlowSpec):
    # Minimal two-step flow used to smoke-test Kubernetes execution.

    @kubernetes(image=target_image)
    @step
    def start(self):
        # Runs inside target_image on the cluster; prints then advances.
        print('start')
        self.next(self.end)

    @step
    def end(self):
        # Terminal step; no @kubernetes decorator, so it uses the default
        # execution environment.
        print('done')


if __name__ == "__main__":
    TestRun()
Error message: Back-off pulling image "us-west1-docker.pkg.dev/project/image-name/image-name:latest": ImagePullBackOff
which only happens on the GPU pool.
Any idea what the issue is? Any help is appreciated!
dazzling-rocket-38979
# 09/13/2024, 4:26 PM
# (FIX: chat timestamp was fused onto the `resource` line; moved here.)
#
# Grants GCS object read access (container images in legacy GCR live in GCS
# buckets).
# FIX: switched google_project_iam_binding -> google_project_iam_member.
# A *binding* is authoritative for the role: every apply would strip any
# other members of roles/storage.objectViewer across the whole project.
# *member* is additive and only manages this one grant.
resource "google_project_iam_member" "gcr_reader" {
  project = var.project
  role    = "roles/storage.objectViewer"
  member  = "serviceAccount:${google_service_account.metaflow_kubernetes_control_plane_service_account.email}"
}
# Grants read access to Artifact Registry repositories (needed for the node
# service account to pull us-west1-docker.pkg.dev images).
# FIX: switched google_project_iam_binding -> google_project_iam_member.
# A *binding* is authoritative and would remove any other project-level
# members of roles/artifactregistry.reader on every apply; *member* manages
# only this single grant.
resource "google_project_iam_member" "artifact_registry_reader" {
  project = var.project
  role    = "roles/artifactregistry.reader"
  member  = "serviceAccount:${google_service_account.metaflow_kubernetes_control_plane_service_account.email}"
}