From 08ea52bcf5d6e3436396ab6707c0bc4b5bd943e8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 17 Jan 2025 22:47:25 -0800 Subject: [PATCH] fix tune api error Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 25 ++++++++++++++++--- sdk/python/v1beta1/setup.py | 2 +- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 05fd1405a3f..5f293b4e554 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -415,7 +415,9 @@ class name in this argument. experiment.spec.max_failed_trial_count = max_failed_trial_count # If users choose to use a custom objective function. - if objective is not None: + if objective is not None or parameters is not None: + if not objective or not base_image or not parameters: + raise ValueError("One of the required parameters is None") # Add metrics collector to the Katib Experiment. # Up to now, we only support parameter `kind`, of which default value # is `StdOut`, to specify the kind of metrics collector. @@ -518,6 +520,7 @@ class name in this argument. from kubeflow.storage_initializer.hugging_face import ( HuggingFaceDatasetParams, HuggingFaceModelParams, + HuggingFaceTrainerParams, ) from kubeflow.storage_initializer.s3 import S3DatasetParams from kubeflow.training import models as training_models @@ -596,6 +599,11 @@ class name in this argument. "or HuggingFaceDatasetParams." ) + if not isinstance(trainer_parameters, HuggingFaceTrainerParams): + raise ValueError( + "Trainer parameters must be an instance of HuggingFaceTrainerParams." + ) + # Iterate over input parameters and do substitutions. experiment_params = [] trial_params = [] @@ -633,6 +641,8 @@ class name in this argument. model_provider_parameters.model_uri, "--transformer_type", model_provider_parameters.transformer_type.__name__, + "--num_labels", + str(model_provider_parameters.num_labels), "--model_dir", VOLUME_PATH_MODEL, "--dataset_dir", @@ -643,7 +653,11 @@ class name in this argument. f"'{training_args}'", ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - resources=resources_per_trial.resources_per_worker, + resources=( + resources_per_trial.resources_per_worker + if resources_per_trial + else None + ), ) # Create the worker and the master pod. @@ -677,7 +691,10 @@ class name in this argument. ), ) - if resources_per_trial.num_procs_per_worker: + if ( + resources_per_trial is not None + and resources_per_trial.num_procs_per_worker + ): pytorchjob.spec.nproc_per_node = str( resources_per_trial.num_procs_per_worker ) @@ -689,7 +706,7 @@ class name in this argument. ) ) - if resources_per_trial.num_workers > 1: + if resources_per_trial is not None and resources_per_trial.num_workers > 1: pytorchjob.spec.pytorch_replica_specs["Worker"] = ( training_models.KubeflowOrgV1ReplicaSpec( replicas=resources_per_trial.num_workers - 1, diff --git a/sdk/python/v1beta1/setup.py b/sdk/python/v1beta1/setup.py index 51e03abd0ac..04a79bcf0d4 100644 --- a/sdk/python/v1beta1/setup.py +++ b/sdk/python/v1beta1/setup.py @@ -86,6 +86,6 @@ ], install_requires=REQUIRES, extras_require={ - "huggingface": ["kubeflow-training[huggingface]==1.8.0"], + "huggingface": ["kubeflow-training[huggingface]==1.8.1"], }, )