From ed1000617f5d44d6d1f470f51055f5d300554cbb Mon Sep 17 00:00:00 2001 From: bubriks Date: Wed, 10 Sep 2025 17:07:02 +0300 Subject: [PATCH] init --- docs/user_guides/fs/data_source/usage.md | 15 ++++++++------- docs/user_guides/fs/feature_group/create.md | 2 +- .../fs/feature_group/create_external.md | 8 ++++---- docs/user_guides/fs/provenance/provenance.md | 12 ++++++------ 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/docs/user_guides/fs/data_source/usage.md b/docs/user_guides/fs/data_source/usage.md index 84fde75db..0d539f252 100644 --- a/docs/user_guides/fs/data_source/usage.md +++ b/docs/user_guides/fs/data_source/usage.md @@ -20,7 +20,7 @@ We retrieve a data source simply by its unique name. project = hopsworks.login() feature_store = project.get_feature_store() # Retrieve data source - connector = feature_store.get_storage_connector('data_source_name') + ds = feature_store.get_data_source('data_source_name') ``` === "Scala" @@ -123,19 +123,20 @@ The `Connector API` relies on data sources behind the scenes to integrate with e This enables seamless integration with any data source as long as there is a data source defined. To create an external feature group, we use the `create_external_feature_group` API, also known as `Connector API`, -and simply pass the data source created before to the `storage_connector` argument. +and simply pass the data source created before to the `data_source` argument. Depending on the external source, we should set either the `query` argument for data warehouse based sources, or the `path` and `data_format` arguments for data lake based sources, similar to reading into dataframes as explained in above section. 
-Example for any data warehouse/SQL based external sources, we set the desired SQL to `query` argument, and set the `storage_connector`
+Example for any data warehouse/SQL based external sources, we set the desired SQL to `query` argument, and set the `data_source`
 argument to the data source object of desired data source.
 
 === "PySpark"
 
     ```python
+    ds.query = "SELECT * FROM TABLE"
+
     fg = feature_store.create_external_feature_group(name="sales",
     version=1,
     description="Physical shop sales features",
-    query="SELECT * FROM TABLE",
-    storage_connector=connector,
+    data_source=ds,
     primary_key=['ss_store_sk'],
     event_time='sale_date'
     )
@@ -148,7 +149,7 @@ For more information on `Connector API`, read detailed guide about [external fea
 
 ## Writing Training Data
 
 Data Sources are also used while writing training data to external sources. While calling the
-[Feature View](../../../concepts/fs/feature_view/fv_overview.md) API `create_training_data` , we can pass the `storage_connector` argument which is necessary to materialise
+[Feature View](../../../concepts/fs/feature_view/fv_overview.md) API `create_training_data` , we can pass the `data_source` argument which is necessary to materialise
 the data to external sources, as shown below.
 
 === "PySpark"
 
     ```python
     feature_view.create_training_data(
         description = 'describe training data',
         data_format = 'spark_data_format', # e.g.
data_format = "parquet" or data_format = "csv"
         write_options = {"wait_for_job": False},
-        storage_connector = connector
+        data_source = ds
     )
     ```
 
diff --git a/docs/user_guides/fs/feature_group/create.md b/docs/user_guides/fs/feature_group/create.md
index b7e25c817..4e9cf3527 100644
--- a/docs/user_guides/fs/feature_group/create.md
+++ b/docs/user_guides/fs/feature_group/create.md
@@ -87,7 +87,7 @@ When you create a feature group, you can specify the table format you want to us
 
 ##### Data Source
 
-During the creation of a feature group, it is possible to define the `storage_connector` parameter, this allows for management of offline data in the desired table format outside the Hopsworks cluster. Currently, [S3](../data_source/creation/s3.md) and [GCS](../data_source/creation/gcs.md) connectors and "DELTA" `time_travel_format` format is supported.
+During the creation of a feature group, it is possible to define the `data_source` parameter, which allows for management of offline data in the desired table format outside the Hopsworks cluster. Currently, [S3](../data_source/creation/s3.md) and [GCS](../data_source/creation/gcs.md) connectors and "DELTA" `time_travel_format` format are supported.
##### Online Table Configuration diff --git a/docs/user_guides/fs/feature_group/create_external.md b/docs/user_guides/fs/feature_group/create_external.md index ac29c6e34..ee4e3d106 100644 --- a/docs/user_guides/fs/feature_group/create_external.md +++ b/docs/user_guides/fs/feature_group/create_external.md @@ -21,7 +21,7 @@ To create an external feature group using the HSFS APIs you need to provide an e === "Python" ```python - connector = feature_store.get_storage_connector("data_source_name") + ds = feature_store.get_data_source("data_source_name") ``` ### Create an External Feature Group @@ -50,7 +50,7 @@ The first step is to instantiate the metadata through the `create_external_featu version=1, description="Physical shop sales features", query=query, - storage_connector=connector, + data_source=ds, primary_key=['ss_store_sk'], event_time='sale_date' ) @@ -67,7 +67,7 @@ The first step is to instantiate the metadata through the `create_external_featu version=1, description="Physical shop sales features", data_format="parquet", - storage_connector=connector, + data_source=ds, primary_key=['ss_store_sk'], event_time='sale_date' ) @@ -105,7 +105,7 @@ You can enable online storage for external feature groups, however, the sync fro version=1, description="Physical shop sales features", query=query, - storage_connector=connector, + data_source=ds, primary_key=['ss_store_sk'], event_time='sale_date', online_enabled=True) diff --git a/docs/user_guides/fs/provenance/provenance.md b/docs/user_guides/fs/provenance/provenance.md index 4362d9247..6dff711a9 100644 --- a/docs/user_guides/fs/provenance/provenance.md +++ b/docs/user_guides/fs/provenance/provenance.md @@ -32,27 +32,27 @@ The relationship between data sources and feature groups is captured automatical ```python # Retrieve the data source - snowflake_sc = fs.get_storage_connector("snowflake_sc") + ds = fs.get_data_source("snowflake_sc") + ds.query = "SELECT * FROM USER_PROFILES" # Create the user profiles feature 
group user_profiles_fg = fs.create_external_feature_group( name="user_profiles", version=1, - storage_connector=snowflake_sc, - query="SELECT * FROM USER_PROFILES" + data_source=ds ) user_profiles_fg.save() ``` ### Using the APIs -Starting from a feature group metadata object, you can traverse upstream the provenance graph to retrieve the metadata objects of the data sources that are part of the feature group. To do so, you can use the [get_storage_connector_provenance](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#get_storage_connector_provenance) method. +Starting from a feature group metadata object, you can traverse upstream the provenance graph to retrieve the metadata objects of the data sources that are part of the feature group. To do so, you can use the [get_data_source_provenance](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/feature_group_api/#get_data_source_provenance) method. === "Python" ```python # Returns all data sources linked to the provided feature group - lineage = user_profiles_fg.get_storage_connector_provenance() + lineage = user_profiles_fg.get_data_source_provenance() # List all accessible parent data sources lineage.accessible @@ -68,7 +68,7 @@ Starting from a feature group metadata object, you can traverse upstream the pro ```python # Returns an accessible data source linked to the feature group (if it exists) - user_profiles_fg.get_storage_connector() + user_profiles_fg.get_data_source() ``` To traverse the provenance graph in the opposite direction (i.e. from the data source to the feature group), you can use the [get_feature_groups_provenance](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/storage_connector_api/#get_feature_groups_provenance) method. 
When navigating the provenance graph downstream, the `deleted` feature groups are not tracked by provenance, as such, the `deleted` property will always return an empty list.