diff --git a/.gitignore b/.gitignore
index 013988e..69b0e7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,6 @@ search_engine/indexes*
 search_engine/resources*
 transfer/flagged
 user_session_logs/
+
+
+*_err_*.log
\ No newline at end of file
diff --git a/README_INSTALL_ARM-MAC.md b/README_INSTALL_ARM-MAC.md
new file mode 100644
index 0000000..a7b529d
--- /dev/null
+++ b/README_INSTALL_ARM-MAC.md
@@ -0,0 +1,82 @@
+# Getting started on an ARM-based Mac
+
+This README supports installation on ARM-based Macs.
+
+
+## 🚀 Setup on an ARM-based Mac
+Our code is implemented in Python. To set up, do the following:
+1. Install [Python 3.8.13](https://www.python.org/downloads/release/python-3813/)
+2. Install [Java](https://www.java.com/en/download/)
+3. Download the source code:
+```sh
+> git clone https://github.com/princeton-nlp/webshop.git webshop
+```
+4. Create a virtual environment using [Anaconda](https://anaconda.org/anaconda/python) and activate it
+```sh
+> conda create -n webshop python=3.8.13
+> conda activate webshop
+```
+5. Install requirements into the `webshop` virtual environment via the `setup_arm.sh` script
+```sh
+> ./setup_arm.sh [-d small|all]
+```
+
+
+
+
+## Default Installation (Failures):
+1. `pip3 install -r requirements.txt`
+
+Fails at:
+- tokenizers
+- nmslib
+- lightgbm
+- transformers==4.19.2
+- PyYAML==6.0.0
+
+Fails because the wrong versions get installed:
+- Werkzeug==2.2.2 (must be installed instead of 3.0.0 for Flask to work [https://stackoverflow.com/questions/77213053/why-did-flask-start-failing-with-importerror-cannot-import-name-url-quote-fr])
+- numpy==1.24.4 (must be installed instead of numpy 1.22 [https://stackoverflow.com/questions/33859531/runtimeerror-module-compiled-against-api-version-a-but-this-version-of-numpy-is])
+
+**tokenizers fix**:
+`pip3 install tokenizers`
+
+**nmslib fix**:
+`pip3 install Cython`
+`CFLAGS="-mavx -DWARN(a)=(a)" pip3 install nmslib`
+[https://github.com/nmslib/nmslib/issues/476]
+
+**lightgbm fix**:
+[https://github.com/microsoft/LightGBM/issues/5328]
+`brew install libomp`
+`pip3 install lightgbm`
+
+
+**transformers fix**:
+`pip3 install transformers==4.23.1` works
+
+**PyYAML fix**:
+`pip3 install PyYAML==6.0.1` works
+
+
+## Running setup.sh
+Fails at:
+- `python -m spacy download en_core_web_lg
+`
+
+**spaCy fix**:
+Interestingly:
+
+- `pip install -U 'spacy[apple]'`
+
+Does **NOT** work.
+
+However, installing spaCy with conda first (`conda install spacy`) works.
+
+Remove:
+`spacy==3.3.0` from `requirements.txt`
+
+
+
+
+
\ No newline at end of file
diff --git a/requirements_arm.txt b/requirements_arm.txt
new file mode 100644
index 0000000..fad5d5a
--- /dev/null
+++ b/requirements_arm.txt
@@ -0,0 +1,23 @@
+beautifulsoup4==4.11.1
+cleantext==1.1.4
+env==0.1.0
+Flask==2.1.2
+gdown
+gradio
+gym==0.24.0
+numpy==1.24.4
+pandas==1.4.2
+pyserini==0.17.0
+pytest
+PyYAML==6.0.1
+rank_bm25==0.2.2
+requests==2.27.1
+requests_mock
+rich==12.4.4
+scikit_learn==1.1.1
+selenium==4.2.0
+thefuzz==0.19.0
+torch==1.11.0
+tqdm==4.64.0
+train==0.0.5
+transformers==4.23.1
\ No newline at end of file
diff --git a/setup_arm.sh b/setup_arm.sh
new file mode 100644
index 0000000..e6e2a39
--- /dev/null
+++ b/setup_arm.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#
+# Sets up WebShop on an ARM-based Mac: installs the dependencies, downloads the
+# product data and builds the search engine index.
+# Usage: ./setup_arm.sh -d small|all
+
+# Prints usage information and exits with status 1.
+helpFunction()
+{
+  echo "Usage: $0 [-d small|all]"
+  echo -e "\t-d small|all - Specify whether to download entire dataset (all) or just 1000 (small)"
+  exit 1 # Exit script after printing help
+}
+
+# Get values of command line flags
+while getopts d: flag
+do
+  case "${flag}" in
+    d) data=${OPTARG};;
+    *) helpFunction;; # unknown flag or missing argument; getopts has already reported it
+  esac
+done
+
+# Validate -d before installing anything, so that a wrong value fails fast
+# instead of after the slow dependency installation.
+case "$data" in
+  small|all) ;;
+  "")
+    echo "[ERROR]: Missing -d flag"
+    helpFunction
+    ;;
+  *)
+    # Single quotes around -d: backticks inside a double-quoted string are a
+    # command substitution and would try to run `-d` as a command.
+    echo "[ERROR]: argument for '-d' flag not recognized"
+    helpFunction
+    ;;
+esac
+
+# Install Python Dependencies
+# (spaCy is installed via conda; it is not listed in requirements_arm.txt)
+conda install spacy
+pip install -r requirements_arm.txt
+
+# Install Environment Dependencies via `conda`
+conda install -c pytorch faiss-cpu
+conda install -c conda-forge openjdk=11
+
+# Download dataset into `data` folder via `gdown` command.
+# URLs are quoted so the `?` can never be expanded as a glob character.
+mkdir -p data
+cd data || exit 1
+if [ "$data" = "small" ]; then
+  gdown "https://drive.google.com/uc?id=1EgHdxQ_YxqIQlvvq5iKlCrkEKR6-j0Ib" # items_shuffle_1000 - product scraped info
+  gdown "https://drive.google.com/uc?id=1IduG0xl544V_A_jv3tHXC0kyFi7PnyBu" # items_ins_v2_1000 - product attributes
+else
+  gdown "https://drive.google.com/uc?id=1A2whVgOO0euk5O13n2iYDM0bQRkkRduB" # items_shuffle
+  gdown "https://drive.google.com/uc?id=1s2j6NgHljiZzQNL3veZaAiyW_qDEgBNi" # items_ins_v2
+fi
+gdown "https://drive.google.com/uc?id=14Kb5SPBk_jfdLZ_CDBNitW98QLDlKR5O" # items_human_ins
+cd ..
+
+# Download spaCy NLP model (en_core_web_sm, not the large en_core_web_lg)
+python -m spacy download en_core_web_sm
+
+# Build search engine index
+cd search_engine || exit 1
+mkdir -p resources resources_100 resources_1k resources_100k
+python convert_product_file_format.py # convert items.json => required doc format
+mkdir -p indexes
+./run_indexing.sh
+cd ..
+
+# Create logging folder + samples of log data
+get_human_trajs () {
+  PYCMD=$(cat <