From f93f99d918fe4febc28f8b1ab45bc38c50bd59fa Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 21 Jun 2024 01:13:55 +0000 Subject: [PATCH] sync model docs: 2024-06-21T01:13:55Z --- module/model/user/generated/10_model_schema.md | 6 ++++-- .../user/generated/11_model_observability.md | 16 ++++++++-------- module/model/user/templates/10_model_schema.md | 6 ++++-- .../user/templates/11_model_observability.md | 16 ++++++++-------- 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/module/model/user/generated/10_model_schema.md b/module/model/user/generated/10_model_schema.md index c32dfde..657726a 100644 --- a/module/model/user/generated/10_model_schema.md +++ b/module/model/user/generated/10_model_schema.md @@ -15,7 +15,8 @@ Detail specification is defined by using `InferenceSchema` class, following are |-------|------|-------------|-----------| | `feature_types` | Dict[str, ValueType] | Mapping between feature name with the type of the feature | True | | `model_prediction_output` | PredictionOutput | Prediction specification that differ between model types, e.g BinaryClassificationOutput, RegressionOutput, RankingOutput | True | -| `prediction_id_column` | str | The column name that contains prediction id value | True | +| `session_id_column` | str | The column name that is unique identifier for a request | True | +| `row_id_column` | str | The column name that is unique identifier for a row in a request | True | | `tag_columns` | Optional[List[str]] | List of column names that contains additional information about prediction, you can treat it as metadata | False | From above we can see `model_prediction_output` field that has type `PredictionOutput`, this field is a specification of prediction that is generated by the model depending on it's model type. 
Currently we support 3 model types in the schema: @@ -73,7 +74,8 @@ from merlin.observability.inference import InferenceSchema, ValueType, BinaryCla "featureC": ValueType.STRING, "featureD": ValueType.BOOLEAN }, - prediction_id_column="prediction_id", + session_id_column="session_id", + row_id_column="row_id", model_prediction_output=BinaryClassificationOutput( prediction_score_column="score", actual_label_column="target", diff --git a/module/model/user/generated/11_model_observability.md b/module/model/user/generated/11_model_observability.md index ff063f1..e7a403e 100644 --- a/module/model/user/generated/11_model_observability.md +++ b/module/model/user/generated/11_model_observability.md @@ -33,17 +33,17 @@ Beside changes in signature, you can see some of those methods returning new typ | Field | Type | Description| |-------|------|------------| -| `prediction_ids` | List[str] | Unique identifier for each row in prediction | -| `features` | Union[Values, pandas.DataFrame] | Features value that is used by the model to generate prediction. Length of features should be the same with `prediction_ids` | +| `row_ids` | List[str] | Unique identifier for each row in prediction | +| `features` | Union[Values, pandas.DataFrame] | Features value that is used by the model to generate prediction. Length of features should be the same with `row_ids` | | `entities` | Optional[Union[Values, pandas.DataFrame]] | Additional data that are not used for prediction, but this data is used to retrieved another features, e.g `driver_id`, we can retrieve features associated with certain `driver_id`| -| `session_id` | str | Identifier for the request. This value will be used together with `prediction_ids` as prediction identifier in model observability system | +| `session_id` | str | Identifier for the request. 
This value will be used together with `row_ids` as prediction identifier in model observability system | `ModelInput` data is essential for model observability since it contains features values and identifier of prediction. Features values are used to calculate feature drift, and identifier is used as join key between features, prediction data with ground truth data. On the other hand, `ModelOutput` is the class that represent raw model prediction output, not the final output of PyFunc model. `ModelOutput` class contains following fields: | Field | Type | Description | |-------|------|-------------| | `prediction` | Values | `predictions` contains prediction output from ml_predict, it may contains multiple columns e.g for multiclass classification or for binary classification that contains prediction score and label | -| `prediction_ids` | List[str] | Unique identifier for each row in prediction output | +| `row_ids` | List[str] | Unique identifier for each row in prediction output | Same like `ModelInput`, `ModelOutput` is also essential for model observability, it can be used to calculate prediction drift but more importantly it can calculate performance metrics. @@ -61,7 +61,7 @@ There is not much change on the deployment part, users just needs to set `enable * featureC that has string type * featureD that has float type -The model type is ranking with prediction group id information is located in `session_id` column, prediction id in `prediction_id` column, rank score in `score` column and `relevance_score_column` in `relevance_score`. Below is the snipped of the python code +The model type is ranking with prediction group id information is located in `session_id` column, row id in `row_id` column, rank score in `score` column and `relevance_score_column` in `relevance_score`. 
Below is the snippet of the python code ```python class ModelObservabilityModel(PyFuncV3Model): @@ -69,13 +69,13 @@ class ModelObservabilityModel(PyFuncV3Model): def preprocess(self, request: dict, **kwargs) -> ModelInput: return ModelInput( session_id="session_id", - prediction_ids=["prediction_1", "prediction_2"], + row_ids=["prediction_1", "prediction_2"], features=pd.DataFrame([[0.7, 200, "ID", True], [0.99, 250, "SG", False]], columns=["featureA", "featureB", "featureC", "featureD"]), ) def infer(self, model_input: ModelInput) -> ModelOutput: return ModelOutput( - prediction_ids=model_input.prediction_ids, + row_ids=model_input.row_ids, predictions=Values(columns=["score"], data=[[0.5], [0.9]]), ) def postprocess(self, model_output: ModelOutput, request: dict) -> dict: @@ -90,7 +90,7 @@ model_schema = ModelSchema(spec=InferenceSchema( "featureD": ValueType.BOOLEAN }, session_id_column="session_id", - row_id_column="prediction_id", + row_id_column="row_id", model_prediction_output=RankingOutput( rank_score_column="score", prediction_group_id_column="session_id", diff --git a/module/model/user/templates/10_model_schema.md b/module/model/user/templates/10_model_schema.md index c32dfde..657726a 100644 --- a/module/model/user/templates/10_model_schema.md +++ b/module/model/user/templates/10_model_schema.md @@ -15,7 +15,8 @@ Detail specification is defined by using `InferenceSchema` class, following are |-------|------|-------------|-----------| | `feature_types` | Dict[str, ValueType] | Mapping between feature name with the type of the feature | True | | `model_prediction_output` | PredictionOutput | Prediction specification that differ between model types, e.g BinaryClassificationOutput, RegressionOutput, RankingOutput | True | -| `prediction_id_column` | str | The column name that contains prediction id value | True | +| `session_id_column` | str | The column name that is unique identifier for a request | True | +| `row_id_column` | str | The column name that is 
unique identifier for a row in a request | True | | `tag_columns` | Optional[List[str]] | List of column names that contains additional information about prediction, you can treat it as metadata | False | From above we can see `model_prediction_output` field that has type `PredictionOutput`, this field is a specification of prediction that is generated by the model depending on it's model type. Currently we support 3 model types in the schema: @@ -73,7 +74,8 @@ from merlin.observability.inference import InferenceSchema, ValueType, BinaryCla "featureC": ValueType.STRING, "featureD": ValueType.BOOLEAN }, - prediction_id_column="prediction_id", + session_id_column="session_id", + row_id_column="row_id", model_prediction_output=BinaryClassificationOutput( prediction_score_column="score", actual_label_column="target", diff --git a/module/model/user/templates/11_model_observability.md b/module/model/user/templates/11_model_observability.md index 4b376aa..b667255 100644 --- a/module/model/user/templates/11_model_observability.md +++ b/module/model/user/templates/11_model_observability.md @@ -33,17 +33,17 @@ Beside changes in signature, you can see some of those methods returning new typ | Field | Type | Description| |-------|------|------------| -| `prediction_ids` | List[str] | Unique identifier for each row in prediction | -| `features` | Union[Values, pandas.DataFrame] | Features value that is used by the model to generate prediction. Length of features should be the same with `prediction_ids` | +| `row_ids` | List[str] | Unique identifier for each row in prediction | +| `features` | Union[Values, pandas.DataFrame] | Features value that is used by the model to generate prediction. 
Length of features should be the same with `row_ids` | | `entities` | Optional[Union[Values, pandas.DataFrame]] | Additional data that are not used for prediction, but this data is used to retrieved another features, e.g `driver_id`, we can retrieve features associated with certain `driver_id`| -| `session_id` | str | Identifier for the request. This value will be used together with `prediction_ids` as prediction identifier in model observability system | +| `session_id` | str | Identifier for the request. This value will be used together with `row_ids` as prediction identifier in model observability system | `ModelInput` data is essential for model observability since it contains features values and identifier of prediction. Features values are used to calculate feature drift, and identifier is used as join key between features, prediction data with ground truth data. On the other hand, `ModelOutput` is the class that represent raw model prediction output, not the final output of PyFunc model. `ModelOutput` class contains following fields: | Field | Type | Description | |-------|------|-------------| | `prediction` | Values | `predictions` contains prediction output from ml_predict, it may contains multiple columns e.g for multiclass classification or for binary classification that contains prediction score and label | -| `prediction_ids` | List[str] | Unique identifier for each row in prediction output | +| `row_ids` | List[str] | Unique identifier for each row in prediction output | Same like `ModelInput`, `ModelOutput` is also essential for model observability, it can be used to calculate prediction drift but more importantly it can calculate performance metrics. 
@@ -61,7 +61,7 @@ There is not much change on the deployment part, users just needs to set `enable * featureC that has string type * featureD that has float type -The model type is ranking with prediction group id information is located in `session_id` column, prediction id in `prediction_id` column, rank score in `score` column and `relevance_score_column` in `relevance_score`. Below is the snipped of the python code +The model type is ranking with prediction group id information is located in `session_id` column, row id in `row_id` column, rank score in `score` column and `relevance_score_column` in `relevance_score`. Below is the snippet of the python code ```python class ModelObservabilityModel(PyFuncV3Model): @@ -69,13 +69,13 @@ class ModelObservabilityModel(PyFuncV3Model): def preprocess(self, request: dict, **kwargs) -> ModelInput: return ModelInput( session_id="session_id", - prediction_ids=["prediction_1", "prediction_2"], + row_ids=["prediction_1", "prediction_2"], features=pd.DataFrame([[0.7, 200, "ID", True], [0.99, 250, "SG", False]], columns=["featureA", "featureB", "featureC", "featureD"]), ) def infer(self, model_input: ModelInput) -> ModelOutput: return ModelOutput( - prediction_ids=model_input.prediction_ids, + row_ids=model_input.row_ids, predictions=Values(columns=["score"], data=[[0.5], [0.9]]), ) def postprocess(self, model_output: ModelOutput, request: dict) -> dict: @@ -90,7 +90,7 @@ model_schema = ModelSchema(spec=InferenceSchema( "featureD": ValueType.BOOLEAN }, session_id_column="session_id", - row_id_column="prediction_id", + row_id_column="row_id", model_prediction_output=RankingOutput( rank_score_column="score", prediction_group_id_column="session_id",