Skip to content

Commit

Permalink
fix tune api error
Browse files Browse the repository at this point in the history
Signed-off-by: helenxie-bit <[email protected]>
  • Loading branch information
helenxie-bit committed Jan 18, 2025
1 parent 93bee4d commit 08ea52b
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 5 deletions.
25 changes: 21 additions & 4 deletions sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,9 @@ class name in this argument.
experiment.spec.max_failed_trial_count = max_failed_trial_count

# If users choose to use a custom objective function.
if objective is not None:
if objective is not None or parameters is not None:
if not objective or not base_image or not parameters:
raise ValueError("One of the required parameters is None")
# Add metrics collector to the Katib Experiment.
# Up to now, we only support parameter `kind`, of which default value
# is `StdOut`, to specify the kind of metrics collector.
Expand Down Expand Up @@ -518,6 +520,7 @@ class name in this argument.
from kubeflow.storage_initializer.hugging_face import (
HuggingFaceDatasetParams,
HuggingFaceModelParams,
HuggingFaceTrainerParams,
)
from kubeflow.storage_initializer.s3 import S3DatasetParams
from kubeflow.training import models as training_models
Expand Down Expand Up @@ -596,6 +599,11 @@ class name in this argument.
"or HuggingFaceDatasetParams."
)

if not isinstance(trainer_parameters, HuggingFaceTrainerParams):
raise ValueError(
"Trainer parameters must be an instance of HuggingFaceTrainerParams."
)

# Iterate over input parameters and do substitutions.
experiment_params = []
trial_params = []
Expand Down Expand Up @@ -633,6 +641,8 @@ class name in this argument.
model_provider_parameters.model_uri,
"--transformer_type",
model_provider_parameters.transformer_type.__name__,
"--num_labels",
str(model_provider_parameters.num_labels),
"--model_dir",
VOLUME_PATH_MODEL,
"--dataset_dir",
Expand All @@ -643,7 +653,11 @@ class name in this argument.
f"'{training_args}'",
],
volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT],
resources=resources_per_trial.resources_per_worker,
resources=(
resources_per_trial.resources_per_worker
if resources_per_trial
else None
),
)

# Create the worker and the master pod.
Expand Down Expand Up @@ -677,7 +691,10 @@ class name in this argument.
),
)

if resources_per_trial.num_procs_per_worker:
if (
resources_per_trial is not None
and resources_per_trial.num_procs_per_worker
):
pytorchjob.spec.nproc_per_node = str(
resources_per_trial.num_procs_per_worker
)
Expand All @@ -689,7 +706,7 @@ class name in this argument.
)
)

if resources_per_trial.num_workers > 1:
if resources_per_trial is not None and resources_per_trial.num_workers > 1:
pytorchjob.spec.pytorch_replica_specs["Worker"] = (
training_models.KubeflowOrgV1ReplicaSpec(
replicas=resources_per_trial.num_workers - 1,
Expand Down
2 changes: 1 addition & 1 deletion sdk/python/v1beta1/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,6 @@
],
install_requires=REQUIRES,
extras_require={
"huggingface": ["kubeflow-training[huggingface]==1.8.0"],
"huggingface": ["kubeflow-training[huggingface]==1.8.1"],
},
)

0 comments on commit 08ea52b

Please sign in to comment.