datahub-project · treff7es · Jul 17, 2024 · Jul 1, 2024 · Jul 1, 2024 · Jul 1, 2024
diff --git a/metadata-ingestion/docs/sources/abs/README.md b/metadata-ingestion/docs/sources/abs/README.md
@@ -0,0 +1,40 @@
+This connector ingests Azure Blob Storage (abbreviated to abs) datasets into DataHub. It allows mapping an individual
+file or a folder of files to a dataset in DataHub.
+To specify the group of files that form a dataset, use `path_specs` configuration in ingestion recipe. Refer
-To specify the group of files that form a dataset, use `path_specs` configuration in ingestion recipe. Refer
+To specify the group of files that form a dataset, use `path_specs` configuration in ingestion recipe. Refer to the section [Path Specs](https://datahubproject.io/docs/generated/ingestion/sources/s3/#path-specs) for more details.
-To specify the group of files that form a dataset, use `path_specs` configuration in ingestion recipe. Refer
+To specify the group of files that form a dataset, use `path_specs` configuration in ingestion recipe. Refer to the section [Path Specs](https://datahubproject.io/docs/generated/ingestion/sources/s3/#path-specs) for more details.
+section [Path Specs](https://datahubproject.io/docs/generated/ingestion/sources/s3/#path-specs) for more details.
+
+### Concept Mapping
+
+This ingestion source maps the following Source System Concepts to DataHub Concepts:
+
+| Source Concept | DataHub Concept | Notes |
+|----------------------------------------|--------------------------------------------------------------------------------------------|------------------|
+| `"abs"` | [Data Platform](https://datahubproject.io/docs/generated/metamodel/entities/dataplatform/) | |
+| abs blob / Folder containing abs blobs | [Dataset](https://datahubproject.io/docs/generated/metamodel/entities/dataset/) | |
+| abs container | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/) | Subtype `Folder` |
+
+This connector supports both local files and those stored on Azure Blob Storage (which must be identified using the
+prefix `http(s)://<account>.blob.core.windows.net/` or `azure://`).
+
+### Supported file types
+
+Supported file types are as follows:
+
+- CSV (*.csv)
+- TSV (*.tsv)
+- JSONL (*.jsonl)
+- JSON (*.json)
+- Parquet (*.parquet)
+- Apache Avro (*.avro)
+
+Schemas for Parquet and Avro files are extracted as provided.
+
+Schemas for schemaless formats (CSV, TSV, JSONL, JSON) are inferred. For CSV, TSV and JSONL files, we consider the first
+100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#config-details))
-100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#config-details))
+100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#profiling))
-100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#config-details))
+100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#profiling)).
+JSON file schemas are inferred on the basis of the entire file (given the difficulty in extracting only the first few
+objects of the file), which may impact performance.
+We are working on using iterator-based JSON parsers to avoid reading in the entire JSON object.
+
+### Profiling
+
+Profiling is not available in the current release.
diff --git a/metadata-ingestion/docs/sources/abs/abs.md b/metadata-ingestion/docs/sources/abs/abs.md
@@ -0,0 +1,204 @@
+
+### Path Specs
+
+Path Specs (`path_specs`) is a list of Path Spec (`path_spec`) objects, where each individual `path_spec` represents one or more datasets. The include path (`path_spec.include`) represents a formatted path to the dataset. This path must end with `*.*` or `*.[ext]` to represent the leaf level. If `*.[ext]` is provided, then only files with the specified extension type will be scanned. "`.[ext]`" can be any of the [supported file types](#supported-file-types). Refer to [example 1](#example-1---individual-file-as-dataset) below for more details.
+
+All folder levels need to be specified in the include path. You can use `/*/` to represent a folder level and avoid specifying the exact folder name. To map a folder as a dataset, use the `{table}` placeholder to represent the folder level for which the dataset is to be created. For a partitioned dataset, you can use the placeholder `{partition_key[i]}` to represent the name of the `i`th partition and `{partition[i]}` to represent the value of the `i`th partition. During ingestion, `i` will be used to match the partition_key to the partition. Refer to [examples 2 and 3](#example-2---folder-of-files-as-dataset-without-partitions) below for more details.
+
+Exclude paths (`path_spec.exclude`) can be used to ignore paths that are not relevant to the current `path_spec`. This path cannot have named variables (`{}`). The exclude path can have `**` to represent multiple folder levels. Refer to [example 4](#example-4---folder-of-files-as-dataset-with-partitions-and-exclude-filter) below for more details.
+
+Refer to [example 5](#example-5---advanced---either-individual-file-or-folder-of-files-as-dataset) if your container has a more complex dataset representation.
+
+**Additional points to note**
-**Additional points to note**
+### Additional Points to Note
-**Additional points to note**
+### Additional Points to Note
+- Folder names should not contain `{`, `}`, `*`, or `/`.
+- The named variable `{folder}` is reserved for internal use. Please do not use it as a named variable.
+
+
+### Path Specs - Examples
+#### Example 1 - Individual file as Dataset
-### Path Specs - Examples
-#### Example 1 - Individual file as Dataset
+### Path Specs - Examples
+
+#### Example 1 - Individual file as Dataset
-### Path Specs - Examples
-#### Example 1 - Individual file as Dataset
+### Path Specs - Examples
+
+#### Example 1 - Individual file as Dataset
+
+Container structure:
+
+```
+test-container
+├── employees.csv
+├── departments.json
+└── food_items.csv
+```
+
+Path specs config to ingest `employees.csv` and `food_items.csv` as datasets:
+```
+path_specs:
+ - include: https://storageaccountname.blob.core.windows.net/test-container/*.csv
+
+```
+This will automatically ignore `departments.json` file. To include it, use `*.*` instead of `*.csv`.
+
+#### Example 2 - Folder of files as Dataset (without Partitions)
+
+Container structure:
+```
+test-container
+└── offers
+ ├── 1.avro
+ └── 2.avro
+
+```
+
+Path specs config to ingest folder `offers` as dataset:
+```
+path_specs:
+ - include: https://storageaccountname.blob.core.windows.net/test-container/{table}/*.avro
+```
+
+`{table}` represents folder for which dataset will be created.
+
-### Path Specs - Examples
-#### Example 1 - Individual file as Dataset
-
-Container structure:
-
-```
-test-container
-├── employees.csv
-├── departments.json
-└── food_items.csv
-```
-
-Path specs config to ingest `employees.csv` and `food_items.csv` as datasets:
-```
-path_specs:
- - include: https://storageaccountname.blob.core.windows.net/test-container/*.csv
-
-```
-This will automatically ignore `departments.json` file. To include it, use `*.*` instead of `*.csv`.
-
-#### Example 2 - Folder of files as Dataset (without Partitions)
-
-Container structure:
-```
-test-container
-└── offers
-  ├── 1.avro
-  └── 2.avro
-
-```
-
-Path specs config to ingest folder `offers` as dataset:
-```
-path_specs:
- - include: https://storageaccountname.blob.core.windows.net/test-container/{table}/*.avro
-```
-
-`{table}` represents folder for which dataset will be created.
- 
+`{table}` represents the folder for which the dataset will be created.
-### Path Specs - Examples
-#### Example 1 - Individual file as Dataset
-
-Container structure:
-
-```
-test-container
-├── employees.csv
-├── departments.json
-└── food_items.csv
-```
-
-Path specs config to ingest `employees.csv` and `food_items.csv` as datasets:
-```
-path_specs:
- - include: https://storageaccountname.blob.core.windows.net/test-container/*.csv
-
-```
-This will automatically ignore `departments.json` file. To include it, use `*.*` instead of `*.csv`.
-
-#### Example 2 - Folder of files as Dataset (without Partitions)
-
-Container structure:
-```
-test-container
-└── offers
-  ├── 1.avro
-  └── 2.avro
-
-```
-
-Path specs config to ingest folder `offers` as dataset:
-```
-path_specs:
- - include: https://storageaccountname.blob.core.windows.net/test-container/{table}/*.avro
-```
-
-`{table}` represents folder for which dataset will be created.
- 
+`{table}` represents the folder for which the dataset will be created.
+#### Example 3 - Folder of files as Dataset (with Partitions)
+
+Container structure:
+```
+test-container
+├── orders
+│ └── year=2022
+│ └── month=2
+│ ├── 1.parquet
+│ └── 2.parquet
+└── returns
+ └── year=2021
+ └── month=2
+ └── 1.parquet
+
+```
+
+Path specs config to ingest folders `orders` and `returns` as datasets:
+```
+path_specs:
+ - include: https://storageaccountname.blob.core.windows.net/test-container/{table}/{partition_key[0]}={partition[0]}/{partition_key[1]}={partition[1]}/*.parquet
+```
+
+One can also use `include: https://storageaccountname.blob.core.windows.net/test-container/{table}/*/*/*.parquet` here; however, the above format is preferred as it allows declaring partitions explicitly.
+
+#### Example 4 - Folder of files as Dataset (with Partitions), and Exclude Filter
+
+Container structure:
+```
+test-container
+├── orders
+│ └── year=2022
+│ └── month=2
+│ ├── 1.parquet
+│ └── 2.parquet
+└── tmp_orders
+ └── year=2021
+ └── month=2
+ └── 1.parquet
+
+
+```
+
+Path specs config to ingest folder `orders` as dataset but not folder `tmp_orders`:
+```
+path_specs:
+ - include: https://storageaccountname.blob.core.windows.net/test-container/{table}/{partition_key[0]}={partition[0]}/{partition_key[1]}={partition[1]}/*.parquet
+ exclude: 
+ - **/tmp_orders/**
+```
+
+
+#### Example 5 - Advanced - Either Individual file OR Folder of files as Dataset
+
+Container structure:
+```
+test-container
+├── customers
+│ ├── part1.json
+│ ├── part2.json
+│ ├── part3.json
+│ └── part4.json
+├── employees.csv
+├── food_items.csv
+├── tmp_10101000.csv
+└── orders
+ └── year=2022
+ └── month=2
+ ├── 1.parquet
+ ├── 2.parquet
+ └── 3.parquet
+
+```
+
+Path specs config:
+```
+path_specs:
+ - include: https://storageaccountname.blob.core.windows.net/test-container/*.csv
+ exclude:
+ - **/tmp_10101000.csv
+ - include: https://storageaccountname.blob.core.windows.net/test-container/{table}/*.json
+ - include: https://storageaccountname.blob.core.windows.net/test-container/{table}/{partition_key[0]}={partition[0]}/{partition_key[1]}={partition[1]}/*.parquet
+```
+
+The above config has 3 path_specs and will ingest the following datasets:
+- `employees.csv` - Single File as Dataset
+- `food_items.csv` - Single File as Dataset
+- `customers` - Folder as Dataset
+- `orders` - Folder as Dataset
+ and will ignore file `tmp_10101000.csv`
+
+**Valid path_specs.include**
-**Valid path_specs.include**
+### Valid path_specs.include
-**Valid path_specs.include**
+### Valid path_specs.include
+
+```python
+https://storageaccountname.blob.core.windows.net/my-container/foo/tests/bar.avro # single file table 
+https://storageaccountname.blob.core.windows.net/my-container/foo/tests/*.* # multiple file level tables
+https://storageaccountname.blob.core.windows.net/my-container/foo/tests/{table}/*.avro #table without partition
+https://storageaccountname.blob.core.windows.net/my-container/foo/tests/{table}/*/*.avro #table where partitions are not specified
+https://storageaccountname.blob.core.windows.net/my-container/foo/tests/{table}/*.* # table where no partitions as well as data type specified
+https://storageaccountname.blob.core.windows.net/my-container/{dept}/tests/{table}/*.avro # specifying keywords to be used in display name
+https://storageaccountname.blob.core.windows.net/my-container/{dept}/tests/{table}/{partition_key[0]}={partition[0]}/{partition_key[1]}={partition[1]}/*.avro # specify partition key and value format
+https://storageaccountname.blob.core.windows.net/my-container/{dept}/tests/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.avro # specify partition value only format
+https://storageaccountname.blob.core.windows.net/my-container/{dept}/tests/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # for all extensions
+https://storageaccountname.blob.core.windows.net/my-container/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # table is present at 2 levels down in container
+https://storageaccountname.blob.core.windows.net/my-container/*/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # table is present at 3 levels down in container
+```
+
+**Valid path_specs.exclude**
-**Valid path_specs.exclude**
+### Valid path_specs.exclude
-**Valid path_specs.exclude**
+### Valid path_specs.exclude
+- `**/tests/**`
+- `https://storageaccountname.blob.core.windows.net/my-container/hr/**`
+- `**/tests/*.csv`
+- `https://storageaccountname.blob.core.windows.net/my-container/foo/*/my_table/**`
+
+
+
+If you would like to write a more complicated function for resolving file names, then a {transformer} would be a good fit.
+
+:::caution
+
+Specify as long a fixed prefix (without `/*/`) as possible in `path_specs.include`. This will reduce the scanning time and cost, specifically on Azure Blob Storage.
+
+:::
+
+:::caution
+
+Running profiling against many tables or over many rows can run up significant costs.
+While we've done our best to limit the expensiveness of the queries the profiler runs, you
+should be prudent about the set of tables profiling is enabled on or the frequency
+of the profiling runs.
+
+:::
+
+:::caution
+
+If you are ingesting datasets from Azure Blob Storage, we recommend running the ingestion on a server in the same region to avoid high egress costs.
+
+:::
+
+### Compatibility
+
+Profiles are computed with PyDeequ, which relies on PySpark. Therefore, for computing profiles, we currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` and `SPARK_VERSION` environment variables to be set. The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz).
+
+For an example guide on setting up PyDeequ on AWS, see [this guide](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/).
+
+:::caution
+
+From Spark 3.2.0+, the Avro reader fails on column names that don't start with a letter or that contain characters other than letters, numbers, and underscores. [https://github.com/apache/spark/blob/72c62b6596d21e975c5597f8fff84b1a9d070a02/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala#L158]
+Avro files that contain such columns won't be profiled.
+:::
diff --git a/metadata-ingestion/docs/sources/abs/abs_recipe.yml b/metadata-ingestion/docs/sources/abs/abs_recipe.yml
@@ -0,0 +1,13 @@
+source:
+ type: abs
+ config:
+ path_specs:
+ - include: "https://storageaccountname.blob.core.windows.net/covid19-lake/covid_knowledge_graph/csv/nodes/*.*"
+
+ azure_config:
+ account_name: "*****"
+ sas_token: "*****"
+ container_name: "covid19-lake"
+ env: "PROD"
+
+# sink configs
diff --git a/metadata-ingestion/docs/sources/s3/README.md b/metadata-ingestion/docs/sources/s3/README.md
@@ -1,5 +1,5 @@
 This connector ingests AWS S3 datasets into DataHub. It allows mapping an individual file or a folder of files to a dataset in DataHub. 
-To specify the group of files that form a dataset, use `path_specs` configuration in ingestion recipe. Refer section [Path Specs](https://datahubproject.io/docs/generated/ingestion/sources/s3/#path-specs) for more details.
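+To specify the group of files that form a dataset, use `path_specs` configuration in ingestion recipe. Refer to the section [Path Specs](https://datahubproject.io/docs/generated/ingestion/sources/s3/#path-specs) for more details.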
 
 :::tip
 This connector can also be used to ingest local files.

diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
@@ -258,13 +258,21 @@
  *path_spec_common,
 }
 
+abs_base = {
+ "azure-core==1.29.4",
+ "azure-identity>=1.14.0",
+ "azure-storage-blob>=12.19.0",
+ "azure-storage-file-datalake>=12.14.0",
+}
+
 data_lake_profiling = {
  "pydeequ~=1.1.0",
  "pyspark~=3.3.0",
 }
 
 delta_lake = {
  *s3_base,
+ *abs_base,
  # Version 0.18.0 broken on ARM Macs: https://github.com/delta-io/delta-rs/issues/2577
  "deltalake>=0.6.3, != 0.6.4, < 0.18.0; platform_system == 'Darwin' and platform_machine == 'arm64'",
  "deltalake>=0.6.3, != 0.6.4; platform_system != 'Darwin' or platform_machine != 'arm64'",
@@ -405,6 +413,7 @@
  | {"cachetools"},
  "s3": {*s3_base, *data_lake_profiling},
  "gcs": {*s3_base, *data_lake_profiling},
+ "abs": {*abs_base},
  "sagemaker": aws_common,
  "salesforce": {"simple-salesforce"},
  "snowflake": snowflake_common | usage_common | sqlglot_lib,
@@ -681,6 +690,7 @@
  "demo-data = datahub.ingestion.source.demo_data.DemoDataSource",
  "unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource",
  "gcs = datahub.ingestion.source.gcs.gcs_source:GCSSource",
+ "abs = datahub.ingestion.source.abs.source:ABSSource",
  "sql-queries = datahub.ingestion.source.sql_queries:SqlQueriesSource",
  "fivetran = datahub.ingestion.source.fivetran.fivetran:FivetranSource",
  "qlik-sense = datahub.ingestion.source.qlik_sense.qlik_sense:QlikSenseSource",

diff --git a/metadata-ingestion/src/datahub/ingestion/source/abs/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/abs/__init__.py