feat(ingest): Created a faster ingestion mode - pipeline (#1750)

* Unify pgvector and postgres connection settings * Remove local changes * Update file pgvector->postgres * postgresql should be postgres * Adding pipeline ingestion mode * disable hugging face parallelism. Continue on file to doc transform failure * Semaphore to limit docq async workers. ETA reporting
zylon-ai · Mar 19, 2024 · 134fc54 · 134fc54
1 parent 1efac6a
commit 134fc54
Show file tree

Hide file tree

Showing 5 changed files with 301 additions and 2 deletions.
diff --git a/fern/docs/pages/manual/ingestion.mdx b/fern/docs/pages/manual/ingestion.mdx
@@ -62,6 +62,7 @@ The following ingestion mode exist:
 * `simple`: historic behavior, ingest one document at a time, sequentially
 * `batch`: read, parse, and embed multiple documents using batches (batch read, and then batch parse, and then batch embed)
 * `parallel`: read, parse, and embed multiple documents in parallel. This is the fastest ingestion mode for local setup.
+* `pipeline`: Alternative to parallel.
 To change the ingestion mode, you can use the `embedding.ingest_mode` configuration value. The default value is `simple`.
 
 To configure the number of workers used for parallel or batched ingestion, you can use

diff --git a/private_gpt/components/ingest/ingest_component.py b/private_gpt/components/ingest/ingest_component.py
@@ -6,19 +6,21 @@
 import os
 import threading
 from pathlib import Path
+from queue import Queue
 from typing import Any
 
 from llama_index.core.data_structs import IndexDict
 from llama_index.core.embeddings.utils import EmbedType
 from llama_index.core.indices import VectorStoreIndex, load_index_from_storage
 from llama_index.core.indices.base import BaseIndex
 from llama_index.core.ingestion import run_transformations
-from llama_index.core.schema import Document, TransformComponent
+from llama_index.core.schema import BaseNode, Document, TransformComponent
 from llama_index.core.storage import StorageContext
 
 from private_gpt.components.ingest.ingest_helper import IngestionHelper
 from private_gpt.paths import local_data_path
 from private_gpt.settings.settings import Settings
+from private_gpt.utils.eta import eta
 
 logger = logging.getLogger(__name__)
 
@@ -314,6 +316,170 @@ def __del__(self) -> None:
  self._file_to_documents_work_pool.terminate()
 
 
+class PipelineIngestComponent(BaseIngestComponentWithIndex):
+ """Pipeline ingestion - keeping the embedding worker pool as busy as possible.
+
+ This class implements a threaded ingestion pipeline, which comprises two threads
+ and two queues. The primary thread is responsible for reading and parsing files
+ into documents. These documents are then placed into a queue, which is
+ distributed to a pool of worker processes for embedding computation. After
+ embedding, the documents are transferred to another queue where they are
+ accumulated until a threshold is reached. Upon reaching this threshold, the
+ accumulated documents are flushed to the document store, index, and vector
+ store.
+
+ Exception handling ensures robustness against erroneous files. However, in the
+ pipelined design, one error can lead to the discarding of multiple files. Any
+ discarded files will be reported.
+ """
+
+ NODE_FLUSH_COUNT = 5000 # Save the index every # nodes.
+
+ def __init__(
+ self,
+ storage_context: StorageContext,
+ embed_model: EmbedType,
+ transformations: list[TransformComponent],
+ count_workers: int,
+ *args: Any,
+ **kwargs: Any,
+ ) -> None:
+ super().__init__(storage_context, embed_model, transformations, *args, **kwargs)
+ self.count_workers = count_workers
+ assert (
+ len(self.transformations) >= 2
+ ), "Embeddings must be in the transformations"
+ assert count_workers > 0, "count_workers must be > 0"
+ self.count_workers = count_workers
+ # We are doing our own multiprocessing
+ # To do not collide with the multiprocessing of huggingface, we disable it
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ # doc_q stores parsed files as Document chunks.
+ # Using a shallow queue causes the filesystem parser to block
+ # when it reaches capacity. This ensures it doesn't outpace the
+ # computationally intensive embeddings phase, avoiding unnecessary
+ # memory consumption. The semaphore is used to bound the async worker
+ # embedding computations to cause the doc Q to fill and block.
+ self.doc_semaphore = multiprocessing.Semaphore(
+ self.count_workers
+ ) # limit the doc queue to # items.
+ self.doc_q: Queue[tuple[str, str | None, list[Document] | None]] = Queue(20)
+ # node_q stores documents parsed into nodes (embeddings).
+ # Larger queue size so we don't block the embedding workers during a slow
+ # index update.
+ self.node_q: Queue[
+ tuple[str, str | None, list[Document] | None, list[BaseNode] | None]
+ ] = Queue(40)
+ threading.Thread(target=self._doc_to_node, daemon=True).start()
+ threading.Thread(target=self._write_nodes, daemon=True).start()
+
+ def _doc_to_node(self) -> None:
+ # Parse documents into nodes
+ with multiprocessing.pool.ThreadPool(processes=self.count_workers) as pool:
+ while True:
+ try:
+ cmd, file_name, documents = self.doc_q.get(
+ block=True
+ ) # Documents for a file
+ if cmd == "process":
+ # Push CPU/GPU embedding work to the worker pool
+ # Acquire semaphore to control access to worker pool
+ self.doc_semaphore.acquire()
+ pool.apply_async(
+ self._doc_to_node_worker, (file_name, documents)
+ )
+ elif cmd == "quit":
+ break
+ finally:
+ if cmd != "process":
+ self.doc_q.task_done() # unblock Q joins
+
+ def _doc_to_node_worker(self, file_name: str, documents: list[Document]) -> None:
+ # CPU/GPU intensive work in its own process
+ try:
+ nodes = run_transformations(
+ documents, # type: ignore[arg-type]
+ self.transformations,
+ show_progress=self.show_progress,
+ )
+ self.node_q.put(("process", file_name, documents, nodes))
+ finally:
+ self.doc_semaphore.release()
+ self.doc_q.task_done() # unblock Q joins
+
+ def _save_docs(
+ self, files: list[str], documents: list[Document], nodes: list[BaseNode]
+ ) -> None:
+ try:
+ logger.info(
+ f"Saving {len(files)} files ({len(documents)} documents / {len(nodes)} nodes)"
+ )
+ self._index.insert_nodes(nodes)
+ for document in documents:
+ self._index.docstore.set_document_hash(
+ document.get_doc_id(), document.hash
+ )
+ self._save_index()
+ except Exception:
+ # Tell the user so they can investigate these files
+ logger.exception(f"Processing files {files}")
+ finally:
+ # Clearing work, even on exception, maintains a clean state.
+ nodes.clear()
+ documents.clear()
+ files.clear()
+
+ def _write_nodes(self) -> None:
+ # Save nodes to index. I/O intensive.
+ node_stack: list[BaseNode] = []
+ doc_stack: list[Document] = []
+ file_stack: list[str] = []
+ while True:
+ try:
+ cmd, file_name, documents, nodes = self.node_q.get(block=True)
+ if cmd in ("flush", "quit"):
+ if file_stack:
+ self._save_docs(file_stack, doc_stack, node_stack)
+ if cmd == "quit":
+ break
+ elif cmd == "process":
+ node_stack.extend(nodes) # type: ignore[arg-type]
+ doc_stack.extend(documents) # type: ignore[arg-type]
+ file_stack.append(file_name) # type: ignore[arg-type]
+ # Constant saving is heavy on I/O - accumulate to a threshold
+ if len(node_stack) >= self.NODE_FLUSH_COUNT:
+ self._save_docs(file_stack, doc_stack, node_stack)
+ finally:
+ self.node_q.task_done()
+
+ def _flush(self) -> None:
+ self.doc_q.put(("flush", None, None))
+ self.doc_q.join()
+ self.node_q.put(("flush", None, None, None))
+ self.node_q.join()
+
+ def ingest(self, file_name: str, file_data: Path) -> list[Document]:
+ documents = IngestionHelper.transform_file_into_documents(file_name, file_data)
+ self.doc_q.put(("process", file_name, documents))
+ self._flush()
+ return documents
+
+ def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
+ docs = []
+ for file_name, file_data in eta(files):
+ try:
+ documents = IngestionHelper.transform_file_into_documents(
+ file_name, file_data
+ )
+ self.doc_q.put(("process", file_name, documents))
+ docs.extend(documents)
+ except Exception:
+ logger.exception(f"Skipping {file_data.name}")
+ self._flush()
+ return docs
+
+
 def get_ingestion_component(
  storage_context: StorageContext,
  embed_model: EmbedType,
@@ -336,6 +502,13 @@ def get_ingestion_component(
  transformations=transformations,
  count_workers=settings.embedding.count_workers,
  )
+ elif ingest_mode == "pipeline":
+ return PipelineIngestComponent(
+ storage_context=storage_context,
+ embed_model=embed_model,
+ transformations=transformations,
+ count_workers=settings.embedding.count_workers,
+ )
  else:
  return SimpleIngestComponent(
  storage_context=storage_context,

diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
@@ -155,13 +155,14 @@ class HuggingFaceSettings(BaseModel):
 
 class EmbeddingSettings(BaseModel):
  mode: Literal["huggingface", "openai", "azopenai", "sagemaker", "ollama", "mock"]
- ingest_mode: Literal["simple", "batch", "parallel"] = Field(
+ ingest_mode: Literal["simple", "batch", "parallel", "pipeline"] = Field(
  "simple",
  description=(
  "The ingest mode to use for the embedding engine:\n"
  "If `simple` - ingest files sequentially and one by one. It is the historic behaviour.\n"
  "If `batch` - if multiple files, parse all the files in parallel, "
  "and send them in batch to the embedding model.\n"
+ "In `pipeline` - The Embedding engine is kept as busy as possible\n"
  "If `parallel` - parse the files in parallel using multiple cores, and embedd them in parallel.\n"
  "`parallel` is the fastest mode for local setup, as it parallelize IO RW in the index.\n"
  "For modes that leverage parallelization, you can specify the number of "
@@ -174,6 +175,7 @@ class EmbeddingSettings(BaseModel):
  "The number of workers to use for file ingestion.\n"
  "In `batch` mode, this is the number of workers used to parse the files.\n"
  "In `parallel` mode, this is the number of workers used to parse the files and embed them.\n"
+ "In `pipeline` mode, this is the number of workers that can perform embeddings.\n"
  "This is only used if `ingest_mode` is not `simple`.\n"
  "Do not go too high with this number, as it might cause memory issues. (especially in `parallel` mode)\n"
  "Do not set it higher than your number of threads of your CPU."

diff --git a/private_gpt/utils/eta.py b/private_gpt/utils/eta.py
@@ -0,0 +1,122 @@
+import datetime
+import logging
+import math
+import time
+from collections import deque
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def human_time(*args: Any, **kwargs: Any) -> str:
+ def timedelta_total_seconds(timedelta: datetime.timedelta) -> float:
+ return (
+ timedelta.microseconds
+ + 0.0
+ + (timedelta.seconds + timedelta.days * 24 * 3600) * 10**6
+ ) / 10**6
+
+ secs = float(timedelta_total_seconds(datetime.timedelta(*args, **kwargs)))
+ # We want (ms) precision below 2 seconds
+ if secs < 2:
+ return f"{secs * 1000}ms"
+ units = [("y", 86400 * 365), ("d", 86400), ("h", 3600), ("m", 60), ("s", 1)]
+ parts = []
+ for unit, mul in units:
+ if secs / mul >= 1 or mul == 1:
+ if mul > 1:
+ n = int(math.floor(secs / mul))
+ secs -= n * mul
+ else:
+ # >2s we drop the (ms) component.
+ n = int(secs)
+ if n:
+ parts.append(f"{n}{unit}")
+ return " ".join(parts)
+
+
+def eta(iterator: list[Any]) -> Any:
+ """Report an ETA after 30s and every 60s thereafter."""
+ total = len(iterator)
+ _eta = ETA(total)
+ _eta.needReport(30)
+ for processed, data in enumerate(iterator, start=1):
+ yield data
+ _eta.update(processed)
+ if _eta.needReport(60):
+ logger.info(f"{processed}/{total} - ETA {_eta.human_time()}")
+
+
+class ETA:
+ """Predict how long something will take to complete."""
+
+ def __init__(self, total: int):
+ self.total: int = total # Total expected records.
+ self.rate: float = 0.0 # per second
+ self._timing_data: deque[tuple[float, int]] = deque(maxlen=100)
+ self.secondsLeft: float = 0.0
+ self.nexttime: float = 0.0
+
+ def human_time(self) -> str:
+ if self._calc():
+ return f"{human_time(seconds=self.secondsLeft)} @ {int(self.rate * 60)}/min"
+ return "(computing)"
+
+ def update(self, count: int) -> None:
+ # count should be in the range 0 to self.total
+ assert count > 0
+ assert count <= self.total
+ self._timing_data.append((time.time(), count)) # (X,Y) for pearson
+
+ def needReport(self, whenSecs: int) -> bool:
+ now = time.time()
+ if now > self.nexttime:
+ self.nexttime = now + whenSecs
+ return True
+ return False
+
+ def _calc(self) -> bool:
+ # A sample before a prediction. Need two points to compute slope!
+ if len(self._timing_data) < 3:
+ return False
+
+ # http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
+ # Calculate means and standard deviations.
+ samples = len(self._timing_data)
+ # column wise sum of the timing tuples to compute their mean.
+ mean_x, mean_y = (
+ sum(i) / samples for i in zip(*self._timing_data, strict=False)
+ )
+ std_x = math.sqrt(
+ sum(pow(i[0] - mean_x, 2) for i in self._timing_data) / (samples - 1)
+ )
+ std_y = math.sqrt(
+ sum(pow(i[1] - mean_y, 2) for i in self._timing_data) / (samples - 1)
+ )
+
+ # Calculate coefficient.
+ sum_xy, sum_sq_v_x, sum_sq_v_y = 0.0, 0.0, 0
+ for x, y in self._timing_data:
+ x -= mean_x
+ y -= mean_y
+ sum_xy += x * y
+ sum_sq_v_x += pow(x, 2)
+ sum_sq_v_y += pow(y, 2)
+ pearson_r = sum_xy / math.sqrt(sum_sq_v_x * sum_sq_v_y)
+
+ # Calculate regression line.
+ # y = mx + b where m is the slope and b is the y-intercept.
+ m = self.rate = pearson_r * (std_y / std_x)
+ y = self.total
+ b = mean_y - m * mean_x
+ x = (y - b) / m
+
+ # Calculate fitted line (transformed/shifted regression line horizontally).
+ fitted_b = self._timing_data[-1][1] - (m * self._timing_data[-1][0])
+ fitted_x = (y - fitted_b) / m
+ _, count = self._timing_data[-1] # adjust last data point progress count
+ adjusted_x = ((fitted_x - x) * (count / self.total)) + x
+ eta_epoch = adjusted_x
+
+ self.secondsLeft = max([eta_epoch - time.time(), 0])
+ return True
diff --git a/settings-local.yaml b/settings-local.yaml
@@ -1,3 +1,4 @@
+# poetry install --extras "ui llms-llama-cpp vector-stores-qdrant embeddings-huggingface"
 server:
  env_name: ${APP_ENV:local}