
Commit

Merge pull request #8 from fonhorst/feature/synapse_v0.11.1
Feature/synapse v0.11.1
dev-rinchin authored Aug 24, 2023
2 parents 3a7a7e0 + 1aa0b9e commit a662a3e
Showing 142 changed files with 5,024 additions and 8,098 deletions.
2 changes: 1 addition & 1 deletion .editorconfig
@@ -708,4 +708,4 @@ ij_yaml_keep_indents_on_empty_lines = false
ij_yaml_keep_line_breaks = true

[*.scala]
indent_size = 2
indent_size = 2
15 changes: 7 additions & 8 deletions .github/workflows/CI.yml
@@ -16,8 +16,8 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: [3.6, 3.7, 3.8, 3.9]
os: [ubuntu-latest]
python-version: [3.8, 3.9]

steps:
- uses: actions/checkout@v2
@@ -27,19 +27,18 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- uses: Gr1N/setup-poetry@v7
- uses: Gr1N/setup-poetry@v8
with:
poetry-version: 1.1.7

- name: install libomp for MacOS
if: ${{ matrix.os == 'macos-latest' }}
run: brew install libomp
poetry-version: 1.4.1

- name: install tox
run: |
pip3 install tox==3.24.4
pip3 install tox-gh-actions==2.8.1
- name: download datasets
run: ./bin/download-datasets.sh

- name: test with tox
run: |
tox
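
The workflow change narrows the test matrix to Ubuntu with Python 3.8/3.9, moves to Gr1N/setup-poetry@v8 with Poetry 1.4.1, drops the macOS libomp step, and downloads the example datasets before running tox. A rough sketch of reproducing the same job locally (assumes a POSIX shell and that tox.ini defines the default environments; the pinned versions are taken from the workflow above):

```bash
# Sketch of running the CI test steps locally.
pip3 install tox==3.24.4 tox-gh-actions==2.8.1
./bin/download-datasets.sh   # fetches the example datasets (now into examples/data)
tox                          # runs the test environments defined in tox.ini
```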
1 change: 0 additions & 1 deletion .gitignore
@@ -176,4 +176,3 @@ spark-warehouse

# sphinx output folder
html-docs

3 changes: 1 addition & 2 deletions .pre-commit-config.yaml
@@ -1,6 +1,6 @@
repos:
- repo: https:/psf/black
rev: 20.8b1
rev: 23.7.0
hooks:
- id: black
args: [--line-length=120]
@@ -17,7 +17,6 @@ repos:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: debug-statements

- repo: https:/pycqa/flake8.git
rev: 3.8.4
hooks:
6 changes: 3 additions & 3 deletions Dockerfile
@@ -4,7 +4,7 @@ RUN apt-get update && \
apt-get install -y openjdk-11-jre net-tools wget nano iputils-ping curl && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ENV SCALA_VERSION=2.12.10
RUN wget http://scala-lang.org/files/archive/scala-${SCALA_VERSION}.deb && \
dpkg -i scala-${SCALA_VERSION}.deb
@@ -15,7 +15,7 @@ RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SP
tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark && \
rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz

ENV SPARK_HOME=/spark

RUN pip install poetry && poetry config virtualenvs.create false
@@ -25,4 +25,4 @@ RUN pip install lightautoml --no-deps
RUN pip install synapseml
RUN pip install pyarrow
COPY lightautoml /lama/lightautoml
COPY tests /lama/tests
COPY tests /lama/tests
10 changes: 5 additions & 5 deletions README.md
@@ -7,10 +7,10 @@ It requires:
2. PySpark 3.2+ (installed as a dependency)
3. [Synapse ML library](https://microsoft.github.io/SynapseML/)
(It will be downloaded by Spark automatically)
Currently, only tabular Preset is supported. See demo with spark-based tabular automl
preset in [examples/spark/tabular-preset-automl.py](https:/fonhorst/LightAutoML_Spark/blob/distributed/master/examples/spark/tabular-preset-automl.py).
For further information check docs in the root of the project containing dedicated SLAMA section.

Currently, only tabular Preset is supported. See demo with spark-based tabular automl
preset in [examples/spark/tabular-preset-automl.py](https:/fonhorst/LightAutoML_Spark/blob/distributed/master/examples/spark/tabular-preset-automl.py).
For further information check docs in the root of the project containing dedicated SLAMA section.

<a name="apache"></a>
# License
@@ -40,4 +40,4 @@ poetry config virtualenvs.in-project true
# Install LAMA
poetry lock
poetry install
```
```
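
As a quick smoke test after installation, the demo referenced above can presumably be run from the project's Poetry environment. This is only a sketch: it assumes the example script configures its own local Spark session and SynapseML packages, and that the datasets from bin/download-datasets.sh are already in place.

```bash
# Hypothetical local run of the tabular preset demo referenced in the README.
poetry run python examples/spark/tabular-preset-automl.py
```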
5 changes: 2 additions & 3 deletions bin/download-datasets.sh
@@ -2,7 +2,7 @@

set -ex

dataset_dir="/opt/spark_data/"
dataset_dir="examples/data"

mkdir -p "${dataset_dir}"

@@ -17,5 +17,4 @@ wget https://www.openml.org/data/get_csv/52422/ipums_la_97-small.arff -O ${datas

head -n 25001 ${dataset_dir}/Buzzinsocialmedia_Twitter.csv > ${dataset_dir}/Buzzinsocialmedia_Twitter_25k.csv

cp examples/data/sampled_app_train.csv ${dataset_dir}
unzip examples/data/small_used_cars_data.zip -d ${dataset_dir}
unzip ${dataset_dir}/small_used_cars_data.zip -d ${dataset_dir}
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -48,4 +48,4 @@ services:
- /mnt/ess_storage/DN_1/storage/SLAMA/code:/lama:ro
- /mnt/ess_storage/DN_1/storage/SLAMA/jars/ivy2_jars:/root/.ivy2/jars:ro
- /mnt/ess_storage/DN_1/storage/SLAMA/spark_output:/spark_output:z
- /mnt/ess_storage/DN_1/storage/SLAMA/scripts:/scripts:z
- /mnt/ess_storage/DN_1/storage/SLAMA/scripts:/scripts:z
59 changes: 59 additions & 0 deletions docker/slamactl
@@ -0,0 +1,59 @@
#!/usr/bin/env bash

set -ex

function build() {
poetry build

docker build -t slama:latest -f docker/spark-lama.dockerfile .
}

function run() {
docker run -it slama:latest examples/spark/tabular-preset-automl.py
}

function help() {
echo "
List of commands.
build - builds 'slama:latest' image
run - use 'slama:latest' image to run an example (see possible examples in 'examples/spark' directory of this project)
help - prints this message
"
}

function main () {
cmd="$1"

if [ -z "${cmd}" ]
then
echo "No command is provided."
help
exit 1
fi

shift 1

echo "Executing command: ${cmd}"

case "${cmd}" in

"build")
build
;;

"run")
run
;;

"help")
help
;;

*)
echo "Unknown command: ${cmd}"
;;

esac
}

main "${@}"
27 changes: 14 additions & 13 deletions docker/spark-lama.dockerfile
@@ -1,21 +1,22 @@
FROM python:3.9.9

RUN pip install poetry
WORKDIR /code
#COPY poetry.lock pyproject.toml /code/
COPY pyproject.toml /code/

RUN poetry config virtualenvs.create false --local
RUN poetry install

RUN wget https://download.java.net/openjdk/jdk11/ri/openjdk-11+28_linux-x64_bin.tar.gz
RUN tar -xvf openjdk-11+28_linux-x64_bin.tar.gz
RUN mv jdk-11 /usr/local/lib/jdk-11
RUN ln -s /usr/local/lib/jdk-11/bin/java /usr/local/bin/java

RUN pip install pyarrow
RUN mkdir -p /src
COPY dist/sparklightautoml_dev-0.3.2-py3-none-any.whl /src
RUN pip install /src/sparklightautoml_dev-0.3.2-py3-none-any.whl

RUN python3 -c 'from pyspark.sql import SparkSession; SparkSession.builder.config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:0.11.1-spark3.3").config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven").getOrCreate()'

RUN mkdir /src/jars
COPY jars/spark-lightautoml_2.12-0.1.1.jar /src/jars/

COPY examples /src/examples
COPY examples/data /opt/spark_data

WORKDIR /src

COPY .. /code
RUN poetry build
RUN pip install dist/LightAutoML-0.3.0-py3-none-any.whl
#COPY ivy2_cache /root/.ivy2/cache
ENTRYPOINT ["python3"]
2 changes: 1 addition & 1 deletion docs/dataset.rst
@@ -102,4 +102,4 @@ These conditions may change in the future.
BucketedPersistenceManager
CompositePersistenceManager
CompositeBucketedPersistenceManager
CompositePersistenceManager
CompositePersistenceManager
18 changes: 9 additions & 9 deletions docs/kubernetes_cluster.rst
@@ -8,13 +8,13 @@ To run examples locally one needs just ensure that data files lay in appropriate
These locations typically /opt/spark_data directory.
(Data for the examples can be found in examples/data)

To run examples remotely on a cluster under Kubernetes control one needs
To run examples remotely on a cluster under Kubernetes control one needs
to have installed and configured **kubectl** utility.

1. Establish nfs / S3
"""""""""""""""""""""

This step is necessary to make uploading of script file
This step is necessary to make uploading of script file
(e.g. executable of Spark LAMA) into a location that is accessible from anywhere on cluster.
This file will be used by spark driver which is also submitted to the cluster.
Upon configuring set appropriate value for *spark.kubernetes.file.upload.path* in ``./bin/slamactl.sh`` or mount it to ``/mnt/nfs`` on the localhost.
@@ -29,11 +29,11 @@ Examples required 2 PVC for their functioning (defined in slamactl.sh, spark-sub
3. Define required env variables
""""""""""""""""""""""""""""""""

Define required environment variables to use appropriate kubernetes namespace
Define required environment variables to use appropriate kubernetes namespace
and remote docker repository accessible from anywhere in the cluster. ::

export KUBE_NAMESPACE=spark-lama-exps
export REPO=node2.bdcl:5000
export KUBE_NAMESPACE=spark-lama-exps
export REPO=node2.bdcl:5000


4. Build spark lama dependencies and docker images.
@@ -46,15 +46,15 @@ On this step use slamactl.sh utility to build dependencies and docker images: ::

It will:

- compile jars containing Scala-based components
- compile jars containing Scala-based components
(currently only LAMLStringIndexer required for LE-family transformers)

- download Spark distro and use dockerfiles from there to build base pyspark images
(and push these images to the remote docker repo)

- compile lama wheel (including spark subpackage) and build a docker image based upon mentioned above pyspark images
(this image will be pushed to the remote repository too)

5. Run an example on the remote cluster
"""""""""""""""""""""""""""""""""""""""

2 changes: 1 addition & 1 deletion docs/pipelines.features.rst
@@ -60,4 +60,4 @@ Utility Functions
:nosignatures:
:template: functiontemplate.rst

build_graph
build_graph
2 changes: 1 addition & 1 deletion docs/pipelines.ml.rst
@@ -29,4 +29,4 @@ Pipeline for Nested Cross-Validation
:nosignatures:
:template: classtemplate.rst

SparkNestedTabularMLPipeline
SparkNestedTabularMLPipeline
2 changes: 1 addition & 1 deletion docs/pipelines.selection.rst
@@ -29,4 +29,4 @@ Importance Based Selectors
:nosignatures:
:template: classtemplate.rst

permutation_importance_based.SparkNpPermutationImportanceEstimator
permutation_importance_based.SparkNpPermutationImportanceEstimator
2 changes: 1 addition & 1 deletion docs/reader.rst
@@ -35,4 +35,4 @@ Utility functions for advanced roles guessing
get_gini_func
get_null_scores
get_numeric_roles_stat
get_score_from_pipe
get_score_from_pipe
16 changes: 8 additions & 8 deletions docs/slama_minikube_readme.rst
@@ -41,9 +41,9 @@ Environment setup


5. Create several folders to be used for data storage and particulary for pv (PersistentVolume) and pvc (PersistentVolumeClaim):
One may choose different paths. All described below is just a suggestion.
One may choose different paths. All described below is just a suggestion.

* /opt/data-slama - dataset folder. All required datasets, one is planning to work with,
* /opt/data-slama - dataset folder. All required datasets, one is planning to work with,
should be copied in this folder.

* /opt/result-slama - service folder for intermediate data
@@ -67,8 +67,8 @@ Results of the command execution should look like:

Instead of 'unchanged' state there may be 'created' state if nothing existed before this command was executed.

8. Create pv and pvc to be used by spark application with SLAMA. It is assumed that the folders previously created
will be used for this purpose. One may take a look on the example
8. Create pv and pvc to be used by spark application with SLAMA. It is assumed that the folders previously created
will be used for this purpose. One may take a look on the example
~/LightAutoML/dev-tools/config/spark-lama-data-pv-pvc.yaml to create pv and pvc. ::

kubectl apply -f ./dev-tools/config/spark-lama-data-pv-pvc.yaml
@@ -84,7 +84,7 @@ Instead of 'unchanged' state there may be 'created' state if nothing existed bef
./bin/slamactl.sh build-lama-image

10. One can check resulting images with the command: ::

docker images

.. image:: imgs/image4.png
@@ -94,7 +94,7 @@ Instead of 'unchanged' state there may be 'created' state if nothing existed bef
Run examples in minikube
========================

1. Ensure that REPO and KUBE_NAMESPACE variables are set.
1. Ensure that REPO and KUBE_NAMESPACE variables are set.
Ensure that all required docker images and kubernetes objects have been created.

2. Go to LigthAutoML folder.
@@ -116,7 +116,7 @@ An example of the result:

.. image:: imgs/image6.png

6. One can open Spark Web UI of SLAMA application on localhost.
6. One can open Spark Web UI of SLAMA application on localhost.
That requires to execute a command for port forwarding to one of localhost ports: ::

kubectl -n spark-lama-exps port-forward svc/$(kubectl -n spark-lama-exps get svc -o jsonpath='{.items[0].metadata.name}') 9040:4040 --address='0.0.0.0'
@@ -125,4 +125,4 @@ To open Spark WebUI follow the link <http://localhost:9040>

.. image:: imgs/image7.png

Note: SLAMA application should be in running state.
Note: SLAMA application should be in running state.
2 changes: 1 addition & 1 deletion docs/transformers.rst
@@ -108,4 +108,4 @@ Datetime
SparkTimeToNumTransformer
SparkBaseDiffTransformer
SparkDateSeasonsTransformer
SparkDatetimeHelper
SparkDatetimeHelper
2 changes: 1 addition & 1 deletion docs/validation.rst
@@ -20,4 +20,4 @@ Iterators
~base.SparkBaseTrainValidIterator
~iterators.SparkHoldoutIterator
~iterators.SparkFoldsIterator
~iterators.SparkDummyIterator
~iterators.SparkDummyIterator