
Commit

various fixes
- Save history parts under the `history` type to differentiate them from notes.
- Lower-case the metadata keys from the Markdown header to make them easier to use; this also fixes a bug between `url` and `Url`.
- Save the `Date` metadata from the Markdown files as `created_at` metadata using a datetime object.
- Generalize datetime management.
- Make reading and writing of JSON files more robust regarding date fields.
- Add a `Referer` header for history files.
- Fix similarity.py to accept multiple filter parameters.
flepied committed Sep 13, 2023
1 parent 4ac8f8e commit 5d1efae
Showing 6 changed files with 68 additions and 16 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -41,7 +41,7 @@ The system takes as input a directory where you store your markdown notes. For e
 
 ```mermaid
 graph TD
-A[Markdown files from Obsidian]-->B[Text files from markdown and pointers]-->C[Text Chunks]-->D[Vector Database]-->E[Second Brain AI Agent]
+A[Markdown files from your editor]-->B[Text files from markdown and pointers]-->C[Text Chunks]-->D[Vector Database]-->E[Second Brain AI Agent]
 ```
 
 From a markdown file, [transform_md.py](transform_md.py) extracts the text, then follows the links inside the file to extract PDF, URL and YouTube video content and transform it into text. There is some support to extract history data from the markdown files: if there is an `## History` section or the file name contains `History`, the file is split into multiple parts according to `<day> <month> <year>` sections like `### 10 Sep 2023`.
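
To illustrate the history splitting, here is a minimal sketch using the same `DATE3_REGEXP` pattern that transform_md.py defines below (the note content is hypothetical; the real code also handles `## <date>` sections and file timestamps):

```python
import re

# same pattern as DATE3_REGEXP in transform_md.py
DATE3_REGEXP = re.compile(r"^### (\d\d \w+ \d\d\d\d)", re.MULTILINE)

content = "### 10 Sep 2023\nmet the team\n### 11 Sep 2023\nwrote the report\n"

# re.split keeps the captured dates, so parts alternates date / body:
# ['', '10 Sep 2023', '\nmet the team\n', '11 Sep 2023', '\nwrote the report\n']
parts = DATE3_REGEXP.split(content)
for idx in range(1, len(parts), 2):
    print(parts[idx], "->", parts[idx + 1].strip())
```
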
4 changes: 4 additions & 0 deletions integration-test.sh
@@ -50,6 +50,10 @@ if grep -q "I don't know." <<< "$RES"; then
     exit 1
 fi
 
+# wait a bit to be sure to have all the logs in different seconds
+# for the vacuum cleaning process to work
+sleep 2
+
 # test changing a document but not its content
 sudo journalctl -u sba-md --rotate
 sudo journalctl -u sba-md --vacuum-time=1s
9 changes: 6 additions & 3 deletions lib.py
@@ -198,7 +198,7 @@ class DateTimeEncoder(json.JSONEncoder):
     "Encode datetime objects to json"
 
     def default(self, o):
-        "Encode datetime objects to json"
+        "Encode datetime objects to json as isoformat strings"
         if isinstance(o, datetime.datetime):
            return o.isoformat()
         return super().default(o)
@@ -208,9 +208,12 @@ def default(self, o):
 def datetime_decoder(dct):
     "Decode datetime objects from json"
     for key, val in dct.items():
-        if key == "last_accessed_at":
+        if key.endswith("_at") or key.find("date") != -1:
             try:
-                dct[key] = datetime.datetime.fromisoformat(val)
+                if isinstance(val, str):
+                    dct[key] = datetime.datetime.fromisoformat(val)
+                elif isinstance(val, float):
+                    dct[key] = datetime.datetime.fromtimestamp(val)
             except ValueError:
                 pass  # Not a valid datetime string, leave as is
     return dct
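
As a quick sketch of how the encoder and decoder pair up (not part of the commit; assumes lib.py is importable):

```python
import datetime
import json

from lib import DateTimeEncoder, datetime_decoder

meta = {"created_at": datetime.datetime(2023, 9, 13, 10, 0), "url": "file:///tmp/x.md"}
dumped = json.dumps(meta, cls=DateTimeEncoder)
# '{"created_at": "2023-09-13T10:00:00", "url": "file:///tmp/x.md"}'
loaded = json.loads(dumped, object_hook=datetime_decoder)
assert loaded["created_at"] == meta["created_at"]  # restored as a datetime object
```
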
12 changes: 10 additions & 2 deletions similarity.py
@@ -11,6 +11,15 @@
 from lib import get_vectorstore
 
 
+def split_filter(args):
+    "Split the filter arguments into an and query for chromadb"
+    if len(args) == 0:
+        return {}
+    if len(args) == 1:
+        return dict([args[0].split("=", 1)])
+    return {"$and": [{arg.split("=", 1)[0]: arg.split("=", 1)[1]} for arg in args]}
+
+
 def main(query, **kwargs):
     "Entry point"
     vector_store = get_vectorstore()
@@ -21,7 +30,6 @@ def main(query, **kwargs):
 
 if __name__ == "__main__":
     load_dotenv()
-    args = [arg.split("=", 1) for arg in sys.argv[2:]]
-    main(sys.argv[1], filter=dict(args))
+    main(sys.argv[1], where=split_filter(sys.argv[2:]))
 
 # similarity.py ends here
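
For reference, a sketch of what the new `split_filter` helper returns for zero, one and several `key=value` arguments (the values are hypothetical):

```python
from similarity import split_filter

print(split_filter([]))                # {}
print(split_filter(["type=history"]))  # {'type': 'history'}
print(split_filter(["type=history", "url=file:///tmp/x.md"]))
# {'$and': [{'type': 'history'}, {'url': 'file:///tmp/x.md'}]}
```
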
41 changes: 34 additions & 7 deletions transform_md.py
@@ -43,13 +43,16 @@ def save_content(file_path, text, check_content=True, **metadata):
     "save the text and metadata into a json file"
     if check_content:
         try:
+            print(f"reading {file_path}", file=sys.stderr)
             with open(file_path, "r", encoding="utf-8") as in_f:
                 data = json.load(in_f)
             if data["text"] == text:
                 print(f"content is the same for {file_path}", file=sys.stderr)
                 return False
         except FileNotFoundError:
             pass
+        except json.decoder.JSONDecodeError as exc:
+            print(f"invalid json file {file_path}: {exc}", file=sys.stderr)
     print(f"writing {file_path} metadata={metadata}", file=sys.stderr)
     data = {"text": text, "metadata": metadata}
     with open(file_path, "w", encoding="utf-8") as out_f:
@@ -201,11 +204,21 @@ def get_metadata(content):
     for idx, line in enumerate(lines):
         header = line.split(":", 1)
         if len(header) == 2:
-            metadata[header[0].strip()] = header[1].strip()
+            metadata[header[0].strip().lower()] = header[1].strip()
             continue
         if line in ("---", "", "..."):
             continue
         break
+    if "date" in metadata:
+        # transform date to a datetime object and save it as created_at
+        # because langchain uses that field
+        try:
+            metadata["created_at"] = datetime.datetime.strptime(
+                metadata["date"], "%Y/%m/%d %H:%M"
+            )
+            del metadata["date"]
+        except ValueError:
+            pass
     content = "\n".join(lines[idx:])
     return metadata, content
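
A sketch of the effect on a note's front matter (hypothetical content, `get_metadata` as above):

```python
content = """---
Title: Meeting notes
Date: 2023/09/13 10:00
---

Some text.
"""

metadata, body = get_metadata(content)
# keys are lower-cased and the date header becomes a datetime object:
# metadata == {'title': 'Meeting notes',
#              'created_at': datetime.datetime(2023, 9, 13, 10, 0)}
# body == 'Some text.\n'
```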

@@ -220,6 +233,18 @@ def remove_dash(content, level):
     return "\n".join(lines)
 
 
+def get_date(date_str):
+    "Get the date from a string trying different formats: 01 January 2020 then 01 Jan 2020"
+    try:
+        return datetime.datetime.strptime(date_str, "%d %B %Y")
+    except ValueError:
+        try:
+            return datetime.datetime.strptime(date_str, "%d %b %Y")
+        except ValueError:
+            print(f"Unable to parse date {date_str}", file=sys.stderr)
+            return date_str
+
+
 DATE2_REGEXP = re.compile(r"^## (\d\d \w+ \d\d\d\d)", re.MULTILINE)
 DATE3_REGEXP = re.compile(r"^### (\d\d \w+ \d\d\d\d)", re.MULTILINE)
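
And the fallback behaviour of `get_date` in practice (a sketch; `%B`/`%b` parsing is locale-dependent):

```python
print(get_date("10 September 2023"))  # 2023-09-10 00:00:00, via "%d %B %Y"
print(get_date("10 Sep 2023"))        # 2023-09-10 00:00:00, via the "%d %b %Y" fallback
print(get_date("someday"))            # warns on stderr and returns the string unchanged
```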

@@ -247,13 +272,16 @@ def split_md_file(fname, md_dir):
     stat = os.stat(fname)
     os.utime(base_fname, (stat.st_atime, stat.st_mtime))
     for idx in range(1, len(history), 2):
-        history_date = datetime.datetime.strptime(history[idx], "%d %b %Y")
+        history_date = get_date(history[idx])
+        if isinstance(history_date, str):
+            continue
         if level == 1:
             date = history_date.strftime("%d")
         else:
             date = history_date.strftime("%Y%m%d")
         part_fname = os.path.join(md_dir, basename + date + ".md")
         with open(part_fname, "w", encoding="UTF-8") as fptr:
+            fptr.write(f"---\nReferer: {basename}\n---\n\n")
             fptr.write("# " + history[idx] + remove_dash(history[idx + 1], level))
         mtime = (history_date + datetime.timedelta(hours=12)).timestamp()
         os.utime(part_fname, (mtime, mtime))
@@ -274,8 +302,10 @@ def write_output_file(md_file, out_dir, metadata):
         metadata, content = get_metadata(output.page_content)
         metadata["type"] = "notes"
     else:
-        content = output.page_content
-        metadata["last_accessed_at"] = (last_accessed_at,)
+        new_metadata, content = get_metadata(output.page_content)
+        metadata.update(new_metadata)
+        metadata["type"] = "history"
+        metadata["last_accessed_at"] = last_accessed_at
     if "url" not in metadata:
         metadata["url"] = f"file://{md_file}"
     print(f"saving {md_file=} with {metadata=}", file=sys.stderr)
@@ -307,7 +337,6 @@ def process_md_file(fname, out_dir, checksum_store):
         return False
     basename = os.path.basename(fname[:-3])
     oname = get_output_file_path(out_dir, basename)
-    stat = os.stat(fname)
     if is_same_time(fname, oname):
         print(f"skipping {fname} as there is no time change", file=sys.stderr)
         return False
@@ -321,8 +350,6 @@
         print(f"skipping {fname} as content did not change", file=sys.stderr)
         return False
     print(f"processed '{fname}'", file=sys.stderr)
-    # set the timestamp to be the same
-    os.utime(oname, (stat.st_atime, stat.st_mtime))
     return True


16 changes: 13 additions & 3 deletions transform_txt.py
@@ -5,6 +5,7 @@
 into vector embeddings and store the vectors in a vector database.
 """
 
+import datetime
 import json
 import os
 import sys
@@ -54,12 +55,21 @@ def validate_and_extract_url(fname, basename, out_dir):
     if is_same_time(fname, oname):
         return False, None
     with open(fname, encoding="utf-8") as in_stream:
-        data = json.load(in_stream, object_hook=datetime_decoder)
+        try:
+            data = json.load(in_stream, object_hook=datetime_decoder)
+        except json.JSONDecodeError as exc:
+            print(f"Could not parse {fname}: {exc}", file=sys.stderr)
+            return False, None
     if "metadata" not in data:
         print(f"Could not find metadata in {fname}", file=sys.stderr)
         return False, None
     metadata = data["metadata"]
     # convert the datetime to timestamp because chromadb does not
     # support datetime
-    if "last_accessed_at" in metadata:
-        metadata["last_accessed_at"] = metadata["last_accessed_at"].timestamp()
+    for key, val in metadata.items():
+        # check if the value is a datetime
+        if isinstance(val, datetime.datetime):
+            metadata[key] = val.timestamp()
     return metadata, data["text"]
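
A sketch of what the conversion loop above does to a metadata dictionary (hypothetical values):

```python
import datetime

metadata = {
    "created_at": datetime.datetime(2023, 9, 13, 10, 0),
    "last_accessed_at": datetime.datetime(2023, 9, 13, 12, 30),
    "url": "file:///tmp/x.md",
}
for key, val in metadata.items():
    if isinstance(val, datetime.datetime):
        metadata[key] = val.timestamp()
# the datetime values are now floats (POSIX timestamps), which chromadb
# accepts as metadata; the url string is left untouched
```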


Expand Down
