Fix tune API error

Signed-off-by: helenxie-bit <[email protected]>
kubeflow · Jan 18, 2025 · 08ea52b
1 parent 93bee4d
commit 08ea52b
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 5 deletions.
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -415,7 +415,9 @@ class name in this argument.
             experiment.spec.max_failed_trial_count = max_failed_trial_count
 
         # If users choose to use a custom objective function.
-        if objective is not None:
+        if objective is not None or parameters is not None:
+            if not objective or not base_image or not parameters:
+                raise ValueError("One of the required parameters is None")
             # Add metrics collector to the Katib Experiment.
             # Up to now, we only support parameter `kind`, of which default value
             # is `StdOut`, to specify the kind of metrics collector.
@@ -518,6 +520,7 @@ class name in this argument.
                 from kubeflow.storage_initializer.hugging_face import (
                     HuggingFaceDatasetParams,
                     HuggingFaceModelParams,
+                    HuggingFaceTrainerParams,
                 )
                 from kubeflow.storage_initializer.s3 import S3DatasetParams
                 from kubeflow.training import models as training_models
@@ -596,6 +599,11 @@ class name in this argument.
                     "or HuggingFaceDatasetParams."
                 )
 
+            if not isinstance(trainer_parameters, HuggingFaceTrainerParams):
+                raise ValueError(
+                    "Trainer parameters must be an instance of HuggingFaceTrainerParams."
+                )
+
             # Iterate over input parameters and do substitutions.
             experiment_params = []
             trial_params = []
@@ -633,6 +641,8 @@ class name in this argument.
                     model_provider_parameters.model_uri,
                     "--transformer_type",
                     model_provider_parameters.transformer_type.__name__,
+                    "--num_labels",
+                    str(model_provider_parameters.num_labels),
                     "--model_dir",
                     VOLUME_PATH_MODEL,
                     "--dataset_dir",
@@ -643,7 +653,11 @@ class name in this argument.
                     f"'{training_args}'",
                 ],
                 volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT],
-                resources=resources_per_trial.resources_per_worker,
+                resources=(
+                    resources_per_trial.resources_per_worker
+                    if resources_per_trial
+                    else None
+                ),
             )
 
             # Create the worker and the master pod.
@@ -677,7 +691,10 @@ class name in this argument.
                 ),
             )
 
-            if resources_per_trial.num_procs_per_worker:
+            if (
+                resources_per_trial is not None
+                and resources_per_trial.num_procs_per_worker
+            ):
                 pytorchjob.spec.nproc_per_node = str(
                     resources_per_trial.num_procs_per_worker
                 )
@@ -689,7 +706,7 @@ class name in this argument.
                 )
             )
 
-            if resources_per_trial.num_workers > 1:
+            if resources_per_trial is not None and resources_per_trial.num_workers > 1:
                 pytorchjob.spec.pytorch_replica_specs["Worker"] = (
                     training_models.KubeflowOrgV1ReplicaSpec(
                         replicas=resources_per_trial.num_workers - 1,

diff --git a/sdk/python/v1beta1/setup.py b/sdk/python/v1beta1/setup.py
@@ -86,6 +86,6 @@
     ],
     install_requires=REQUIRES,
     extras_require={
-        "huggingface": ["kubeflow-training[huggingface]==1.8.0"],
+        "huggingface": ["kubeflow-training[huggingface]==1.8.1"],
     },
 )