[ENH] add postgres vectordb #767

Open · wants to merge 15 commits into base: master
19 changes: 16 additions & 3 deletions .github/workflows/workflow.yml
@@ -158,12 +158,20 @@ jobs:
      -
        name: Create Test Database
        run: |
-          sleep 1
+          until docker compose exec -T \
+            store_pgsql pg_isready -U postgres; do sleep 1; done
+
+          sleep 1
+
+          docker compose exec -T \
+            store_pgsql \
+            psql -U postgres -c "create database test_db;"
+
           docker compose exec -T \
             store_pgsql \
-            psql -U postgres -c "create database test_db"
+            psql -U postgres -d test_db -c "CREATE EXTENSION IF NOT EXISTS vector;"

      -
        name: Backend Tests
        env:
@@ -434,12 +442,17 @@ jobs:
        name: Create Store Database
        run: |
          cd store
-          sleep 1
+          until docker compose exec -T \
+            store_pgsql pg_isready -U postgres; do sleep 1; done
+
+          # docker compose exec -T \
+          #   store_pgsql \
+          #   psql -U postgres -c "create database neurostore"
+
           docker compose exec -T \
             store_pgsql \
-            psql -U postgres -c "create database neurostore"
+            psql -U postgres -d neurostore -c "CREATE EXTENSION IF NOT EXISTS vector;"
      -
        name: Initialize Compose Database
        run: |
2 changes: 2 additions & 0 deletions store/.env.example
@@ -4,6 +4,8 @@ ACE_DIR=/media
FILE_DIR=/tmp
COMPOSE_CONVERT_WINDOWS_PATHS=1
POSTGRES_HOST=store_pgsql
+POSTGRES_USER=postgres
+POSTGRES_DB=neurostore
POSTGRES_PASSWORD=example
BEARERINFO_FUNC=neurostore.resources.auth.decode_token
AUTH0_CLIENT_ID=YOUR_CLIENT_ID
147 changes: 78 additions & 69 deletions store/neurostore/ingest/__init__.py
@@ -469,9 +469,6 @@ def ace_ingestion_logic(coordinates_df, metadata_df, text_df):
    # see if there are duplicates for the newly created base_studies
    all_base_studies = []
    with db.session.no_autoflush:
-        all_studies = {
-            s.pmid: s for s in Study.query.filter_by(source="neurosynth").all()
-        }
        for metadata_row, text_row in zip(
            metadata_df.itertuples(), text_df.itertuples()
        ):
@@ -561,76 +558,88 @@ def ace_ingestion_logic(coordinates_df, metadata_df, text_df):

            # append base study to commit
            to_commit.append(base_study)
            # keep track of all created/modified base studies
            all_base_studies.append(base_study)

-            s = all_studies.get(pmid, Study())
-
-            # try to update the study if information is missing
-            study_info = {
-                "name": metadata_row.title,
-                "doi": doi,
-                "pmid": pmid,
-                "description": text_row.abstract,
-                "authors": metadata_row.authors,
-                "publication": metadata_row.journal,
-                "year": year,
-                "level": "group",
-                "source": "neurosynth",
-            }
-            for col, value in study_info.items():
-                source_attr = getattr(s, col)
-                setattr(s, col, source_attr or value)
-
-            analyses = []
-            points = []
-
-            try:
-                study_coord_data = coordinates_df.loc[[id_]]
-            except KeyError:
-                print(f"pmid: {id_} has no coordinates")
-                continue
-            for order, (t_id, df) in enumerate(study_coord_data.groupby("table_id")):
-                a = (
-                    Analysis.query.filter_by(table_id=str(t_id)).one_or_none()
-                    or Analysis()
-                )
-                a.name = df["table_label"][0] or str(t_id)
-                a.table_id = str(t_id)
-                a.order = a.order or order
-                a.description = (
-                    df["table_caption"][0]
-                    if not df["table_caption"].isna()[0]
-                    else None
-                )
-                if not a.study:
-                    a.study = s
-                analyses.append(a)
-                point_idx = 0
-                for _, p in df.iterrows():
-                    point = Point(
-                        x=p["x"],
-                        y=p["y"],
-                        z=p["z"],
-                        space=metadata_row.coordinate_space,
-                        kind=(
-                            df["statistic"][0]
-                            if not df["statistic"].isna()[0]
-                            else "unknown"
-                        ),
-                        analysis=a,
-                        entities=[Entity(label=a.name, level="group", analysis=a)],
-                        order=point_idx,
-                    )
-                    points.append(point)
-                    point_idx += 1
-            to_commit.extend(points)
-            to_commit.extend(analyses)
-            # append study as version of study
-            base_study.versions.append(s)
-
-    db.session.add_all(to_commit)
-    db.session.flush()
-    for bs in all_base_studies:
-        bs.update_has_images_and_points()
+            relevant_studies = base_study.versions
+            if not relevant_studies:
+                relevant_studies.append(Study())
+
+            # if all studies have a user,
+            # add a new study version to incorporate the new data
+            if all(s.user is not None for s in relevant_studies):
+                relevant_studies.append(Study())
+
+            for s in relevant_studies:
+                # try to update the study if information is missing
+                study_info = {
+                    "name": metadata_row.title,
+                    "doi": doi,
+                    "pmid": pmid,
+                    "description": text_row.abstract,
+                    "authors": metadata_row.authors,
+                    "publication": metadata_row.journal,
+                    "year": year,
+                    "level": "group",
+                    "source": "neurosynth",
+                }
+                for col, value in study_info.items():
+                    source_attr = getattr(s, col)
+                    setattr(s, col, source_attr or value)
+                if s.user is not None:
+                    # do not edit studies that are user owned
+                    continue
+                analyses = []
+                points = []
+
+                try:
+                    study_coord_data = coordinates_df.loc[[id_]]
+                except KeyError:
+                    print(f"pmid: {id_} has no coordinates")
+                    continue
+                for order, (t_id, df) in enumerate(
+                    study_coord_data.groupby("table_id")
+                ):
+                    a = (
+                        Analysis.query.filter_by(table_id=str(t_id)).one_or_none()
+                        or Analysis()
+                    )
+                    a.name = df["table_label"][0] or str(t_id)
+                    a.table_id = str(t_id)
+                    a.order = a.order or order
+                    a.description = (
+                        df["table_caption"][0]
+                        if not df["table_caption"].isna()[0]
+                        else None
+                    )
+                    if not a.study:
+                        a.study = s
+                    analyses.append(a)
+                    point_idx = 0
+                    for _, p in df.iterrows():
+                        point = Point(
+                            x=p["x"],
+                            y=p["y"],
+                            z=p["z"],
+                            space=metadata_row.coordinate_space,
+                            kind=(
+                                df["statistic"][0]
+                                if not df["statistic"].isna()[0]
+                                else "unknown"
+                            ),
+                            analysis=a,
+                            entities=[Entity(label=a.name, level="group", analysis=a)],
+                            order=point_idx,
+                        )
+                        points.append(point)
+                        point_idx += 1
+                to_commit.extend(points)
+                to_commit.extend(analyses)
+
+    db.session.add_all(to_commit)
+    db.session.flush()
+    for bs in all_base_studies:
+        bs.update_has_images_and_points()
    db.session.commit()


4 changes: 3 additions & 1 deletion store/neurostore/models/data.py
@@ -8,9 +8,10 @@
from sqlalchemy.ext.mutable import MutableDict
from sqlalchemy.orm import relationship, backref
from sqlalchemy.sql import func

import shortuuid

-from .migration_types import TSVector
+from .migration_types import TSVector, PGVector
from ..database import db


@@ -178,6 +179,7 @@ class BaseStudy(BaseMixin, db.Model):
    has_coordinates = db.Column(db.Boolean, default=False, nullable=False)
    has_images = db.Column(db.Boolean, default=False, nullable=False)
    user_id = db.Column(db.Text, db.ForeignKey("users.external_id"), index=True)
+    ada_openai_vector = db.Column(PGVector(1536))  # length of openai ada vector
    _ts_vector = db.Column(
        "__ts_vector__",
        TSVector(),
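The diff shown here does not include an Alembic migration for the new column. A minimal sketch of what such a migration could look like, assuming the `BaseStudy` model maps to a `base_studies` table (the revision identifiers and table name are assumptions, not taken from the PR):

    # hypothetical Alembic migration for the new vector column (sketch only)
    import sqlalchemy as sa
    from alembic import op
    from pgvector.sqlalchemy import Vector

    revision = "000000000000"  # placeholder
    down_revision = None  # placeholder


    def upgrade():
        # the vector extension must exist before a vector column can be created
        op.execute("CREATE EXTENSION IF NOT EXISTS vector")
        op.add_column(
            "base_studies",  # assumed table name for the BaseStudy model
            sa.Column("ada_openai_vector", Vector(1536), nullable=True),
        )


    def downgrade():
        op.drop_column("base_studies", "ada_openai_vector")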
20 changes: 20 additions & 0 deletions store/neurostore/models/migration_types.py
@@ -1,5 +1,25 @@
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import TSVECTOR
+from pgvector.sqlalchemy import Vector
+
+
+class PGVector(sa.types.TypeDecorator):
+    """Vector column type wrapping pgvector's Vector, used for semantic search."""
+
+    cache_ok = True
+    impl = Vector
+
+    def __init__(self, dim):
+        super().__init__()
+        self.impl = Vector(dim)
+
+    def process_bind_param(self, value, dialect):
+        # values pass through unchanged on the way into the database
+        return value
+
+    def process_result_value(self, value, dialect):
+        # values pass through unchanged on the way out of the database
+        return value


class TSVector(sa.types.TypeDecorator):
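Because `PGVector` decorates pgvector's SQLAlchemy `Vector` type, similarity queries can be written with pgvector's distance operators. A sketch, assuming the model is importable as shown; it uses `.op("<->")` (pgvector's L2 distance operator) directly rather than pgvector's comparator helpers, since those may not pass through a `TypeDecorator`:

    # hypothetical nearest-neighbour query against the new column
    from neurostore.models.data import BaseStudy

    query_embedding = [0.0] * 1536  # placeholder; a real ada embedding goes here

    nearest = (
        BaseStudy.query
        .filter(BaseStudy.ada_openai_vector.isnot(None))
        .order_by(BaseStudy.ada_openai_vector.op("<->")(query_embedding))
        .limit(10)
        .all()
    )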
1 change: 1 addition & 0 deletions store/neurostore/requirements.txt
@@ -14,6 +14,7 @@ gunicorn~=22.0
ipython~=7.19
pandas~=1.2
pip-chill~=1.0
+pgvector~=0.2.5
psycopg2-binary~=2.8
pyld~=2.0
python-jose~=3.3
22 changes: 22 additions & 0 deletions store/postgres/Dockerfile
@@ -1,11 +1,33 @@
FROM postgres:12.12

# Install necessary dependencies
RUN apt-get update && apt-get install -y dos2unix
RUN apt-get install -yq python3-pip python-dev build-essential
RUN apt-get install -yq cron
RUN pip3 install awscli

+# Install pgvector build dependencies
+RUN apt-get install -y postgresql-server-dev-12 gcc make curl clang
+
+# Download and install pgvector
+RUN curl -L https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -o pgvector.tar.gz \
+    && tar -xzf pgvector.tar.gz \
+    && cd pgvector-0.7.0 \
+    && make \
+    && make install \
+    && cd .. \
+    && rm -rf pgvector.tar.gz pgvector-0.7.0
+
+# Copy initialization script
+COPY init-vector-extension.sql /docker-entrypoint-initdb.d/

# Copy scripts and set permissions
COPY pg_dump-to-s3 /home
RUN chmod +x /home/pg_dump-to-s3.sh /home/s3-autodelete.sh

# Set up cron jobs
RUN crontab /home/backup.txt
RUN service cron start

# Convert scripts to Unix format
RUN dos2unix /home/pg_dump-to-s3.sh
RUN dos2unix /home/s3-autodelete.sh
1 change: 1 addition & 0 deletions store/postgres/init-vector-extension.sql
@@ -0,0 +1 @@
+CREATE EXTENSION IF NOT EXISTS vector;
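Nothing in this diff populates `ada_openai_vector`, so embeddings presumably get written by a separate job. A backfill sketch under stated assumptions: the OpenAI embeddings call is the standard client API, but the `openai` package is not in the PR's requirements, and the app import path, the choice of `description` as the text to embed, and the batch size are all guesses:

    # hypothetical backfill script: embed each base study's abstract and store it
    from openai import OpenAI

    from neurostore.core import app  # assumed location of the Flask app
    from neurostore.database import db
    from neurostore.models.data import BaseStudy

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    with app.app_context():
        studies = BaseStudy.query.filter(
            BaseStudy.ada_openai_vector.is_(None),
            BaseStudy.description.isnot(None),
        ).limit(100)  # arbitrary batch size
        for study in studies:
            resp = client.embeddings.create(
                model="text-embedding-ada-002",  # 1536 dimensions, matching the column
                input=study.description,
            )
            study.ada_openai_vector = resp.data[0].embedding
        db.session.commit()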