fix: add debug mode, utf-8 for koreader metadata files, revamp the section alignment logic for epubs to use title or nav anchor.
codito · Dec 23, 2023 · e97cbee · e97cbee
1 parent 7b5cded
commit e97cbee
Show file tree

Hide file tree

Showing 7 changed files with 127 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -3,15 +3,16 @@
 **TL;DR** noted is a command line app to liberate your highlights and notes.
 
 Noted extracts annotations embedded into documents (pdf), or collects them from
-readers (kindle). It tries to align them with the chapters and context to
+readers (kindle or koreader). It tries to align them with the chapters and context to
 produce a plain text markdown file.
 
 ## Features
 
 **✓** Extracts annotations (highlights and notes) for documents and books 
 **✓** Extracts _context_ and _chapter headings_ along with the annotations 
-**✓** Supports `pdf` and `mobi` files 
+**✓** Supports `pdf`, `epub` and `mobi` files 
 **✓** Detects kindle `My Clippings.txt` files 
+**✓** Detects koreader `*.sdr` directories 
 **✓** Saves all the extracted information in markdown text
 
 ## Installation
@@ -85,7 +86,7 @@ Completed in 1.71s.
 ## Roadmap
 
 - Support for `kfx` and `azw3` files in kindle
-- Support for additional readers like `koreader`, `kobo` etc.
+- Support for additional readers like `kobo` etc.
 
 Contributions in any form e.g. bug reports, feature requests or PRs are most
 welcome!

diff --git a/src/Noted/Core/ExtractWorkflow.cs b/src/Noted/Core/ExtractWorkflow.cs
@@ -74,7 +74,9 @@ public async Task<int> RunAsync(Configuration configuration)
  continue;
  }
 
- // TODO extract file scoped external annotations (KOReader)
+ // Extract if the document has _external_ annotations (kindle, koreader) or
+ // can have _embedded_ annotations (like pdf).
+ // FIXME
  this.Raise(new ExtractionStartedEventArgs { FileName = file });
  await using var stream = this.fileSystem.OpenPathForRead(file);
  var document = await reader.Read(

diff --git a/src/Noted/Extensions/Libraries/KOReader/KOReaderAnnotationProvider.cs b/src/Noted/Extensions/Libraries/KOReader/KOReaderAnnotationProvider.cs
@@ -7,6 +7,7 @@ namespace Noted.Extensions.Libraries.KOReader;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
+using System.Text;
 using NLua;
 using Noted.Core.Extensions;
 using Noted.Core.Models;
@@ -34,8 +35,14 @@ public IEnumerable<Annotation> GetAnnotations(string sourcePath)
  foreach (var annotation in annotationFiles)
  {
  using var lua = new Lua();
+ lua.State.Encoding = Encoding.UTF8;
  var annotationTable = GetLuaTable(lua, lua.DoFile(annotation)[0]);
- var bookmarksTable = GetLuaTable(lua, annotationTable["bookmarks"]);
+ if (!annotationTable.TryGetValue("bookmarks", out var bookmarkNode) || bookmarkNode == null)
+ {
+ continue;
+ }
+
+ var bookmarksTable = GetLuaTable(lua, bookmarkNode);
  var highlightTable = GetLuaTable(lua, annotationTable["highlight"]);
  var highlights = highlightTable.Values
  .SelectMany(h => GetLuaTable(lua, h).Values)
@@ -66,9 +73,10 @@ public IEnumerable<Annotation> GetAnnotations(string sourcePath)
  var pos0 = bookmarkDict["pos0"].ToString();
  var pos1 = bookmarkDict["pos1"].ToString();
  bookmarkDict.TryGetValue("chapter", out var chapterTitle);
+ var epubXPath = new EpubXPathLocation(pos0!, pos1!);
  var context = new AnnotationContext()
  {
- SerializedLocation = new EpubXPathLocation(pos0!, pos1!).ToString(),
+ SerializedLocation = epubXPath.ToString(),
  DocumentSection = new DocumentSection(chapterTitle?.ToString() ?? string.Empty, 0, 0, null)
  };
  yield return new Annotation(

diff --git a/src/Noted/Extensions/Readers/EpubReader.cs b/src/Noted/Extensions/Readers/EpubReader.cs
@@ -4,13 +4,15 @@
 namespace Noted.Extensions.Readers
 {
  using System;
+ using System.CodeDom;
  using System.Collections.Generic;
  using System.IO;
  using System.Linq;
  using System.Threading.Tasks;
  using AngleSharp.Dom;
  using AngleSharp.Html.Parser;
  using AngleSharp.XPath;
+ using MiscUtil.Xml.Linq.Extensions;
  using Noted.Core.Extensions;
  using Noted.Core.Models;
  using Noted.Core.Platform.IO;
@@ -42,27 +44,36 @@ public async Task<Document> Read(
  Annotation: a))
  .OrderBy(p => p.Location.Start.DocumentFragmentId)
  .ToList();
+ if (externalAnnotations.Count == 0)
+ {
+ return new Document { Title = docRef.Title, Author = docRef.Author };
+ }
+
  var sections = ParseNavigation(epub);
  var content = new Dictionary<int, string>();
  var parser = new HtmlParser(new HtmlParserOptions
  {
  IsKeepingSourceReferences = true
  });
+ var annotationIndex = 0;
  foreach (var annotationTuple in externalAnnotations)
  {
- var docIndex = annotationTuple.Location.Start.DocumentFragmentId;
- var document = await parser.ParseDocumentAsync(epub.ReadingOrder[docIndex - 1].Content);
+ var docIndex = annotationTuple.Location.Start.DocumentFragmentId - 1;
+ var document = await parser.ParseDocumentAsync(epub.ReadingOrder[docIndex].Content);
 
  var annotation = annotationTuple.Annotation;
  var allNodesInDocument = document.Body.SelectNodes("//*");
  var startNode = document.Body.SelectSingleNode($"/{annotationTuple.Location.Start.XPath}");
  var endNode = document.Body.SelectSingleNode($"/{annotationTuple.Location.End.XPath}");
 
  var context = GetContext(allNodesInDocument, startNode, endNode);
- annotation.Context.DocumentSection = sections[annotation.Context.DocumentSection!.Title];
- annotation.Context.Location = ((docIndex - 1) * 1000) + context.Item1;
+ annotation.Context.DocumentSection = GetSectionForAnnotation(epub, sections, docIndex, annotation.Context.DocumentSection!.Title, startNode);
+ annotation.Context.Location = ((docIndex - 1) * 1000) +
+ context.Item1 == -1 ? annotationIndex : context.Item1;
  annotation.Context.Content = context.Item2;
  annotations.Add(annotation);
+
+ annotationIndex++;
  }
 
  var sortedSections = sections.Values.OrderBy(s => s.Location).ToList();
@@ -77,8 +88,13 @@ public async Task<Document> Read(
 
  private static Tuple<int, string> GetContext(List<INode> allNodes, INode start, INode end)
  {
- var startSelector = start.ParentElement!.GetSelector();
- var endSelector = end.ParentElement!.GetSelector();
+ var startSelector = start?.ParentElement?.GetSelector();
+ var endSelector = end?.ParentElement?.GetSelector();
+ if (startSelector == null || endSelector == null)
+ {
+ return new(-1, string.Empty);
+ }
+
  var nodesBetween = new List<string>();
  var startLocation = 0;
 
@@ -111,18 +127,58 @@ private static Tuple<int, string> GetContext(List<INode> allNodes, INode start,
  return new(startLocation, string.Join(Environment.NewLine, nodesBetween));
  }
 
+ /// <summary>
+ /// Picks the table-of-contents section that an annotation belongs to.
+ /// </summary>
+ /// <param name="epub">Book whose reading order supplies the content file path used in section keys.</param>
+ /// <param name="sections">Known sections; keys are matched both exactly and by title substring.</param>
+ /// <param name="docFragmentId">Zero-based index into <c>epub.ReadingOrder</c>.</param>
+ /// <param name="title">Chapter title recorded with the annotation; null is treated as empty.</param>
+ /// <param name="startPath">Node where the annotation starts, or null when it could not be resolved.</param>
+ /// <returns>The matching section, or null when no strategy finds one.</returns>
+ private static DocumentSection GetSectionForAnnotation(EpubBook epub, Dictionary<string, DocumentSection> sections, int docFragmentId, string title, INode startPath)
+ {
+     // A null title used to reach string.Contains and throw ArgumentNullException;
+     // normalise it once so every strategy sees the same value.
+     var sectionTitle = title ?? string.Empty;
+
+     // Strategy 1: locate the section by title
+     // For older epub documents which don't have well formatted toc navigation.
+     if (startPath == null)
+     {
+         return FindByTitle();
+     }
+
+     // Strategy 2: locate the section by anchor from nav element
+     // For epub 3 etc. with well formatted nav.
+     var filePath = epub.ReadingOrder[docFragmentId].FilePath;
+     foreach (var ancestor in startPath.GetAncestors())
+     {
+         // Non-element ancestors have no id and contribute an empty anchor.
+         var sectionAnchor = (ancestor as IElement)?.Id ?? string.Empty;
+         if (sections.TryGetValue($"{filePath}-{sectionAnchor}-{sectionTitle}", out var section))
+         {
+             return section;
+         }
+     }
+
+     // Strategy 3: fallback to title match.
+     return sectionTitle.Length > 0 ? FindByTitle() : null!;
+
+     // First section whose key contains the title; the default pair yields a null value.
+     DocumentSection FindByTitle() =>
+         sections.FirstOrDefault(s => s.Key.Contains(sectionTitle, StringComparison.Ordinal)).Value;
+ }
+
  private static void NavigationDfs(
      EpubNavigationItem root,
      Dictionary<string, DocumentSection> result,
      DocumentSection parent,
      int level,
      ref int index)
  {
-     var rootSection = new DocumentSection(root.Title, level, ++index * 1000, parent);
-     result.Add(root.Title, rootSection);
+     // Depth-first walk of the navigation tree: registers every non-HEADER item
+     // in `result`, then recurses into its nested items one level deeper.
+     // HEADER type nodes are not added as toc entries and do not consume an index.
+     var rootSection = root.Type == EpubNavigationItemType.HEADER ? null
+         : new DocumentSection(root.Title, level, ++index * 1000, parent);
+     if (rootSection != null)
+     {
+         // Key on "<content file path>-<anchor>-<title>" because the title alone
+         // may not be unique within a book. For example: text/foo.xhtml-sec2-II
+         // TryAdd keeps the first entry when a toc repeats the same combination,
+         // instead of aborting the whole read with an ArgumentException.
+         result.TryAdd($"{root.Link?.ContentFilePath ?? string.Empty}-{root.Link?.Anchor ?? string.Empty}-{root.Title}", rootSection);
+     }
+
      foreach (var nestedItem in root.NestedItems)
      {
-         NavigationDfs(nestedItem, result, rootSection, level + 1, ref index);
+         // Children of a skipped HEADER node receive a null parent.
+         NavigationDfs(nestedItem, result, rootSection!, level + 1, ref index);
      }
  }
 

diff --git a/src/Noted/Program.cs b/src/Noted/Program.cs
@@ -5,6 +5,7 @@ namespace Noted
 {
  using System;
  using System.Collections.Generic;
+ using System.Diagnostics;
  using System.Threading.Tasks;
  using Noted.Core;
  using Noted.Core.Extensions;
@@ -39,11 +40,28 @@ public static async Task<int> Main(string[] args)
  var workflows = new Dictionary<string, Func<Configuration, IWorkflow>>
  { { "extract", config => new ExtractWorkflow(config.FileSystem, config.Logger) } };
 
+ WaitForDebuggerIfEnabled();
+
  return await new ConsoleInterface()
  .WithArguments(args)
  .WithConfigurationProvider(configurationProvider)
  .WithWorkflows(workflows)
  .RunAsync();
  }
+
+ /// <summary>
+ /// Holds startup until a debugger attaches when the <c>NOTED_DEBUG</c>
+ /// environment variable is set to a non-empty value; otherwise returns immediately.
+ /// </summary>
+ private static void WaitForDebuggerIfEnabled()
+ {
+     const int PollIntervalMilliseconds = 250;
+
+     var debugEnv = Environment.GetEnvironmentVariable("NOTED_DEBUG");
+     if (string.IsNullOrEmpty(debugEnv))
+     {
+         return;
+     }
+
+     System.Console.WriteLine($"Please attach debugger to pid: {Environment.ProcessId}");
+
+     // Poll instead of calling Debugger.Break() in a tight loop: with no debugger
+     // attached, Break() either spins the CPU or reports a fault to the OS.
+     while (!Debugger.IsAttached)
+     {
+         System.Threading.Thread.Sleep(PollIntervalMilliseconds);
+     }
+
+     // Signal a single breakpoint now that a debugger is listening.
+     Debugger.Break();
+ }
  }
 }
diff --git a/test/Noted.Tests/Extensions/Readers/EpubReaderTests.cs b/test/Noted.Tests/Extensions/Readers/EpubReaderTests.cs
@@ -30,7 +30,7 @@ public async Task EpubReaderShouldParseDocumentWithSections()
  var document = await new EpubReader(this.logger).Read(stream, new Noted.Core.Extensions.ReaderOptions(), (_) => []);
 
  Assert.AreEqual(0, document.Annotations.Count());
- Assert.AreEqual(35, document.Sections.Count());
+ Assert.AreEqual(0, document.Sections.Count());
  Assert.AreEqual("The Prophet", document.Title);
  Assert.AreEqual("Khalil Gibran", document.Author);
  }
@@ -57,5 +57,31 @@ public async Task EpubReaderShouldParseDocumentWithAnnotations()
  Assert.AreEqual("On Giving", annotations[0].Context.DocumentSection.Title);
  Assert.AreNotEqual(0, annotations[0].Context.DocumentSection.Location);
  Assert.AreNotEqual(0, annotations[0].Context.Location);
+ Assert.AreEqual(34, document.Sections.Count());
+ }
+
+ [TestMethod]
+ public async Task EpubReaderShouldParseEpub3DocumentWithAnnotations()
+ {
+     // Arrange: an epub 3 asset plus one koreader highlight addressed by xpath.
+     using var epubStream = AssetFactory.GetAsset(AssetFactory.GetKOReaderLibrary(), "dialogues_seneca.epub");
+     var bookReference = new DocumentReference
+     {
+         Title = "Seneca’s dialogues and consolations, including “On Benefits,” examine living life through the lens of Stoic philosophy."
+     };
+     var highlightContext = new AnnotationContext
+     {
+         DocumentSection = new DocumentSection("XVII", 0, 0, null),
+         SerializedLocation = "epubxpath:///body/DocFragment[7]/body/section/section[17]/p/text()[2].2941-/body/DocFragment[7]/body/section/section[17]/p/text()[2].3124"
+     };
+     var highlight = new Annotation(
+         "We shall never lack causes of anxiety, either pleasurable or painful: our life will be pushed along from one business to another: leisure will always be wished for, and never enjoyed.",
+         bookReference,
+         AnnotationType.Highlight,
+         highlightContext,
+         new DateTime(2023, 12, 23));
+     var epubReader = new EpubReader(this.logger);
+
+     // Act: read the book with the highlight supplied as an external annotation.
+     var parsed = await epubReader.Read(epubStream, new Noted.Core.Extensions.ReaderOptions(), (_) => [highlight]);
+
+     // Assert: the highlight is aligned to its section and the toc is fully parsed.
+     var aligned = parsed.Annotations.ToList();
+     Assert.AreEqual(1, aligned.Count);
+     var alignedContext = aligned[0].Context;
+     Assert.AreEqual("XVII", alignedContext.DocumentSection.Title);
+     Assert.AreNotEqual(0, alignedContext.DocumentSection.Location);
+     Assert.AreNotEqual(0, alignedContext.Location);
+     Assert.AreEqual(552, parsed.Sections.Count());
+ }
 }
diff --git a/test/assets/koreader/dialogues_seneca.epub b/test/assets/koreader/dialogues_seneca.epub