
Commit

various fixes
- Save history parts under the `history` type to differentiate them from notes.
- Lower-case the metadata keys from the Markdown header to make them easier to use; this also fixes a bug between `url` and `Url`.
- Save the `Date` metadata from the Markdown files as `created_at` metadata using a datetime object.
- Generalize datetime management.
- Make reading and writing of JSON files more robust regarding date fields.
- Add a `Referer` header for history files.
- Fix similarity.py to accept multiple filter parameters.
flepied committed Sep 13, 2023
1 parent 4ac8f8e commit 5d1efae
Showing 6 changed files with 68 additions and 16 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -41,7 +41,7 @@ The system takes as input a directory where you store your markdown notes. For e
 
 ```mermaid
 graph TD
-A[Markdown files from Obsidian]-->B[Text files from markdown and pointers]-->C[Text Chunks]-->D[Vector Database]-->E[Second Brain AI Agent]
+A[Markdown files from your editor]-->B[Text files from markdown and pointers]-->C[Text Chunks]-->D[Vector Database]-->E[Second Brain AI Agent]
 ```
 
 From a markdown file, [transform_md.py](transform_md.py) extracts the text, then follows the links inside the file to extract PDF, URL and YouTube video content and transform it into text. There is some support to extract history data from the markdown files: if there is an `## History` section or the file name contains `History`, the file is split into multiple parts according to `<day> <month> <year>` sections like `### 10 Sep 2023`.
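
To illustrate the history splitting, here is a minimal sketch using the same `DATE3_REGEXP` pattern that transform_md.py defines below (the note content is hypothetical; the real code also handles `## <date>` sections and file timestamps):

```python
import re

# same pattern as DATE3_REGEXP in transform_md.py
DATE3_REGEXP = re.compile(r"^### (\d\d \w+ \d\d\d\d)", re.MULTILINE)

content = "### 10 Sep 2023\nmet the team\n### 11 Sep 2023\nwrote the report\n"

# re.split keeps the captured dates, so parts alternates date / body:
# ['', '10 Sep 2023', '\nmet the team\n', '11 Sep 2023', '\nwrote the report\n']
parts = DATE3_REGEXP.split(content)
for idx in range(1, len(parts), 2):
    print(parts[idx], "->", parts[idx + 1].strip())
```
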
4 changes: 4 additions & 0 deletions integration-test.sh
@@ -50,6 +50,10 @@ if grep -q "I don't know." <<< "$RES"; then
     exit 1
 fi
 
+# wait a bit to be sure to have all the logs in different seconds
+# for the vacuum cleaning process to work
+sleep 2
+
 # test changing a document but not its content
 sudo journalctl -u sba-md --rotate
 sudo journalctl -u sba-md --vacuum-time=1s
9 changes: 6 additions & 3 deletions lib.py
@@ -198,7 +198,7 @@ class DateTimeEncoder(json.JSONEncoder):
     "Encode datetime objects to json"
 
     def default(self, o):
-        "Encode datetime objects to json"
+        "Encode datetime objects to json as isoformat strings"
         if isinstance(o, datetime.datetime):
            return o.isoformat()
         return super().default(o)
@@ -208,9 +208,12 @@ def default(self, o):
 def datetime_decoder(dct):
     "Decode datetime objects from json"
     for key, val in dct.items():
-        if key == "last_accessed_at":
+        if key.endswith("_at") or key.find("date") != -1:
             try:
-                dct[key] = datetime.datetime.fromisoformat(val)
+                if isinstance(val, str):
+                    dct[key] = datetime.datetime.fromisoformat(val)
+                elif isinstance(val, float):
+                    dct[key] = datetime.datetime.fromtimestamp(val)
             except ValueError:
                 pass  # Not a valid datetime string, leave as is
     return dct
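
As a quick sketch of how the encoder and decoder pair up (not part of the commit; assumes lib.py is importable):

```python
import datetime
import json

from lib import DateTimeEncoder, datetime_decoder

meta = {"created_at": datetime.datetime(2023, 9, 13, 10, 0), "url": "file:///tmp/x.md"}
dumped = json.dumps(meta, cls=DateTimeEncoder)
# '{"created_at": "2023-09-13T10:00:00", "url": "file:///tmp/x.md"}'
loaded = json.loads(dumped, object_hook=datetime_decoder)
assert loaded["created_at"] == meta["created_at"]  # restored as a datetime object
```
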
12 changes: 10 additions & 2 deletions similarity.py
@@ -11,6 +11,15 @@
 from lib import get_vectorstore
 
 
+def split_filter(args):
+    "Split the filter arguments into an and query for chromadb"
+    if len(args) == 0:
+        return {}
+    if len(args) == 1:
+        return dict([args[0].split("=", 1)])
+    return {"$and": [{arg.split("=", 1)[0]: arg.split("=", 1)[1]} for arg in args]}
+
+
 def main(query, **kwargs):
     "Entry point"
     vector_store = get_vectorstore()
@@ -21,7 +30,6 @@ def main(query, **kwargs):
 
 if __name__ == "__main__":
     load_dotenv()
-    args = [arg.split("=", 1) for arg in sys.argv[2:]]
-    main(sys.argv[1], filter=dict(args))
+    main(sys.argv[1], where=split_filter(sys.argv[2:]))
 
 # similarity.py ends here
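
For reference, a sketch of what the new `split_filter` helper returns for zero, one and several `key=value` arguments (the values are hypothetical):

```python
from similarity import split_filter

print(split_filter([]))                # {}
print(split_filter(["type=history"]))  # {'type': 'history'}
print(split_filter(["type=history", "url=file:///tmp/x.md"]))
# {'$and': [{'type': 'history'}, {'url': 'file:///tmp/x.md'}]}
```
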
41 changes: 34 additions & 7 deletions transform_md.py
@@ -43,13 +43,16 @@ def save_content(file_path, text, check_content=True, **metadata):
     "save the text and metadata into a json file"
     if check_content:
         try:
+            print(f"reading {file_path}", file=sys.stderr)
             with open(file_path, "r", encoding="utf-8") as in_f:
                 data = json.load(in_f)
             if data["text"] == text:
                 print(f"content is the same for {file_path}", file=sys.stderr)
                 return False
         except FileNotFoundError:
             pass
+        except json.decoder.JSONDecodeError as exc:
+            print(f"invalid json file {file_path}: {exc}", file=sys.stderr)
     print(f"writing {file_path} metadata={metadata}", file=sys.stderr)
     data = {"text": text, "metadata": metadata}
     with open(file_path, "w", encoding="utf-8") as out_f:
@@ -201,11 +204,21 @@ def get_metadata(content):
     for idx, line in enumerate(lines):
         header = line.split(":", 1)
         if len(header) == 2:
-            metadata[header[0].strip()] = header[1].strip()
+            metadata[header[0].strip().lower()] = header[1].strip()
             continue
         if line in ("---", "", "..."):
             continue
         break
+    if "date" in metadata:
+        # transform date to a datetime object and save it as created_at
+        # because langchain uses that field
+        try:
+            metadata["created_at"] = datetime.datetime.strptime(
+                metadata["date"], "%Y/%m/%d %H:%M"
+            )
+            del metadata["date"]
+        except ValueError:
+            pass
     content = "\n".join(lines[idx:])
     return metadata, content
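
A sketch of the effect on a note's front matter (hypothetical content, `get_metadata` as above):

```python
content = """---
Title: Meeting notes
Date: 2023/09/13 10:00
---

Some text.
"""

metadata, body = get_metadata(content)
# keys are lower-cased and the date header becomes a datetime object:
# metadata == {'title': 'Meeting notes',
#              'created_at': datetime.datetime(2023, 9, 13, 10, 0)}
# body == 'Some text.\n'
```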

@@ -220,6 +233,18 @@ def remove_dash(content, level):
     return "\n".join(lines)
 
 
+def get_date(date_str):
+    "Get the date from a string trying different formats: 01 January 2020 then 01 Jan 2020"
+    try:
+        return datetime.datetime.strptime(date_str, "%d %B %Y")
+    except ValueError:
+        try:
+            return datetime.datetime.strptime(date_str, "%d %b %Y")
+        except ValueError:
+            print(f"Unable to parse date {date_str}", file=sys.stderr)
+            return date_str
+
+
 DATE2_REGEXP = re.compile(r"^## (\d\d \w+ \d\d\d\d)", re.MULTILINE)
 DATE3_REGEXP = re.compile(r"^### (\d\d \w+ \d\d\d\d)", re.MULTILINE)
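
And the fallback behaviour of `get_date` in practice (a sketch; `%B`/`%b` parsing is locale-dependent):

```python
print(get_date("10 September 2023"))  # 2023-09-10 00:00:00, via "%d %B %Y"
print(get_date("10 Sep 2023"))        # 2023-09-10 00:00:00, via the "%d %b %Y" fallback
print(get_date("someday"))            # warns on stderr and returns the string unchanged
```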

@@ -247,13 +272,16 @@ def split_md_file(fname, md_dir):
     stat = os.stat(fname)
     os.utime(base_fname, (stat.st_atime, stat.st_mtime))
     for idx in range(1, len(history), 2):
-        history_date = datetime.datetime.strptime(history[idx], "%d %b %Y")
+        history_date = get_date(history[idx])
+        if isinstance(history_date, str):
+            continue
         if level == 1:
             date = history_date.strftime("%d")
         else:
             date = history_date.strftime("%Y%m%d")
         part_fname = os.path.join(md_dir, basename + date + ".md")
         with open(part_fname, "w", encoding="UTF-8") as fptr:
+            fptr.write(f"---\nReferer: {basename}\n---\n\n")
             fptr.write("# " + history[idx] + remove_dash(history[idx + 1], level))
         mtime = (history_date + datetime.timedelta(hours=12)).timestamp()
         os.utime(part_fname, (mtime, mtime))
@@ -274,8 +302,10 @@ def write_output_file(md_file, out_dir, metadata):
         metadata, content = get_metadata(output.page_content)
         metadata["type"] = "notes"
     else:
-        content = output.page_content
-        metadata["last_accessed_at"] = (last_accessed_at,)
+        new_metadata, content = get_metadata(output.page_content)
+        metadata.update(new_metadata)
+        metadata["type"] = "history"
+        metadata["last_accessed_at"] = last_accessed_at
     if "url" not in metadata:
         metadata["url"] = f"file://{md_file}"
     print(f"saving {md_file=} with {metadata=}", file=sys.stderr)
@@ -307,7 +337,6 @@ def process_md_file(fname, out_dir, checksum_store):
         return False
     basename = os.path.basename(fname[:-3])
     oname = get_output_file_path(out_dir, basename)
-    stat = os.stat(fname)
     if is_same_time(fname, oname):
         print(f"skipping {fname} as there is no time change", file=sys.stderr)
         return False
@@ -321,8 +350,6 @@
         print(f"skipping {fname} as content did not change", file=sys.stderr)
         return False
     print(f"processed '{fname}'", file=sys.stderr)
-    # set the timestamp to be the same
-    os.utime(oname, (stat.st_atime, stat.st_mtime))
     return True


16 changes: 13 additions & 3 deletions transform_txt.py
@@ -5,6 +5,7 @@
 into vector embeddings and store the vectors in a vector database.
 """
 
+import datetime
 import json
 import os
 import sys
@@ -54,12 +55,21 @@ def validate_and_extract_url(fname, basename, out_dir):
     if is_same_time(fname, oname):
         return False, None
     with open(fname, encoding="utf-8") as in_stream:
-        data = json.load(in_stream, object_hook=datetime_decoder)
+        try:
+            data = json.load(in_stream, object_hook=datetime_decoder)
+        except json.JSONDecodeError as exc:
+            print(f"Could not parse {fname}: {exc}", file=sys.stderr)
+            return False, None
     if "metadata" not in data:
         print(f"Could not find metadata in {fname}", file=sys.stderr)
         return False, None
     metadata = data["metadata"]
     # convert the datetime to timestamp because chromadb does not
     # support datetime
-    if "last_accessed_at" in metadata:
-        metadata["last_accessed_at"] = metadata["last_accessed_at"].timestamp()
+    for key, val in metadata.items():
+        # check if the value is a datetime
+        if isinstance(val, datetime.datetime):
+            metadata[key] = val.timestamp()
     return metadata, data["text"]
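
A sketch of what the conversion loop above does to a metadata dictionary (hypothetical values):

```python
import datetime

metadata = {
    "created_at": datetime.datetime(2023, 9, 13, 10, 0),
    "last_accessed_at": datetime.datetime(2023, 9, 13, 12, 30),
    "url": "file:///tmp/x.md",
}
for key, val in metadata.items():
    if isinstance(val, datetime.datetime):
        metadata[key] = val.timestamp()
# the datetime values are now floats (POSIX timestamps), which chromadb
# accepts as metadata; the url string is left untouched
```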


Expand Down
