Skip to content

Commit

Permalink
fix: add debug mode, utf-8 for koreader metadata files, revamp the se…
Browse files Browse the repository at this point in the history
…ction alignment logic for epubs to use title or nav anchor.
  • Loading branch information
codito committed Dec 23, 2023
1 parent 7b5cded commit e97cbee
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 16 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
**TL;DR** noted is a command line app to liberate your highlights and notes.

Noted extracts annotations embedded into documents (pdf), or collects them from
readers (kindle). It tries to align them with the chapters and context to
readers (kindle or koreader). It tries to align them with the chapters and context to
produce a plain text markdown file.

## Features

**** Extracts annotations (highlights and notes) for documents and books
**** Extracts _context_ and _chapter headings_ along with the annotations
**** Supports `pdf` and `mobi` files
**** Supports `pdf`, `epub` and `mobi` files
**** Detects kindle `My Clippings.txt` files
**** Detects koreader `*.sdr` directories
**** Saves all the extracted information in markdown text

## Installation
Expand Down Expand Up @@ -85,7 +86,7 @@ Completed in 1.71s.
## Roadmap

- Support for `kfx` and `azw3` files in kindle
- Support for additional readers like `koreader`, `kobo` etc.
- Support for additional readers like `kobo` etc.

Contributions in any form e.g. bug reports, feature requests or PRs are most
welcome!
Expand Down
4 changes: 3 additions & 1 deletion src/Noted/Core/ExtractWorkflow.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ public async Task<int> RunAsync(Configuration configuration)
continue;
}

// TODO extract file scoped external annotations (KOReader)
// Extract if the document has _external_ annotations (kindle, koreader) or
// can have _embedded_ annotations (like pdf).
// FIXME
this.Raise(new ExtractionStartedEventArgs { FileName = file });
await using var stream = this.fileSystem.OpenPathForRead(file);
var document = await reader.Read(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ namespace Noted.Extensions.Libraries.KOReader;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using NLua;
using Noted.Core.Extensions;
using Noted.Core.Models;
Expand Down Expand Up @@ -34,8 +35,14 @@ public IEnumerable<Annotation> GetAnnotations(string sourcePath)
foreach (var annotation in annotationFiles)
{
using var lua = new Lua();
lua.State.Encoding = Encoding.UTF8;
var annotationTable = GetLuaTable(lua, lua.DoFile(annotation)[0]);
var bookmarksTable = GetLuaTable(lua, annotationTable["bookmarks"]);
if (!annotationTable.TryGetValue("bookmarks", out var bookmarkNode) || bookmarkNode == null)
{
continue;
}

var bookmarksTable = GetLuaTable(lua, bookmarkNode);
var highlightTable = GetLuaTable(lua, annotationTable["highlight"]);
var highlights = highlightTable.Values
.SelectMany(h => GetLuaTable(lua, h).Values)
Expand Down Expand Up @@ -66,9 +73,10 @@ public IEnumerable<Annotation> GetAnnotations(string sourcePath)
var pos0 = bookmarkDict["pos0"].ToString();
var pos1 = bookmarkDict["pos1"].ToString();
bookmarkDict.TryGetValue("chapter", out var chapterTitle);
var epubXPath = new EpubXPathLocation(pos0!, pos1!);
var context = new AnnotationContext()
{
SerializedLocation = new EpubXPathLocation(pos0!, pos1!).ToString(),
SerializedLocation = epubXPath.ToString(),
DocumentSection = new DocumentSection(chapterTitle?.ToString() ?? string.Empty, 0, 0, null)
};
yield return new Annotation(
Expand Down
74 changes: 65 additions & 9 deletions src/Noted/Extensions/Readers/EpubReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
namespace Noted.Extensions.Readers
{
using System;
using System.CodeDom;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using AngleSharp.Dom;
using AngleSharp.Html.Parser;
using AngleSharp.XPath;
using MiscUtil.Xml.Linq.Extensions;
using Noted.Core.Extensions;
using Noted.Core.Models;
using Noted.Core.Platform.IO;
Expand Down Expand Up @@ -42,27 +44,36 @@ public async Task<Document> Read(
Annotation: a))
.OrderBy(p => p.Location.Start.DocumentFragmentId)
.ToList();
if (externalAnnotations.Count == 0)
{
return new Document { Title = docRef.Title, Author = docRef.Author };
}

var sections = ParseNavigation(epub);
var content = new Dictionary<int, string>();
var parser = new HtmlParser(new HtmlParserOptions
{
IsKeepingSourceReferences = true
});
var annotationIndex = 0;
foreach (var annotationTuple in externalAnnotations)
{
var docIndex = annotationTuple.Location.Start.DocumentFragmentId;
var document = await parser.ParseDocumentAsync(epub.ReadingOrder[docIndex - 1].Content);
var docIndex = annotationTuple.Location.Start.DocumentFragmentId - 1;
var document = await parser.ParseDocumentAsync(epub.ReadingOrder[docIndex].Content);

var annotation = annotationTuple.Annotation;
var allNodesInDocument = document.Body.SelectNodes("//*");
var startNode = document.Body.SelectSingleNode($"/{annotationTuple.Location.Start.XPath}");
var endNode = document.Body.SelectSingleNode($"/{annotationTuple.Location.End.XPath}");

var context = GetContext(allNodesInDocument, startNode, endNode);
annotation.Context.DocumentSection = sections[annotation.Context.DocumentSection!.Title];
annotation.Context.Location = ((docIndex - 1) * 1000) + context.Item1;
annotation.Context.DocumentSection = GetSectionForAnnotation(epub, sections, docIndex, annotation.Context.DocumentSection!.Title, startNode);
annotation.Context.Location = ((docIndex - 1) * 1000) +
context.Item1 == -1 ? annotationIndex : context.Item1;
annotation.Context.Content = context.Item2;
annotations.Add(annotation);

annotationIndex++;
}

var sortedSections = sections.Values.OrderBy(s => s.Location).ToList();
Expand All @@ -77,8 +88,13 @@ public async Task<Document> Read(

private static Tuple<int, string> GetContext(List<INode> allNodes, INode start, INode end)
{
var startSelector = start.ParentElement!.GetSelector();
var endSelector = end.ParentElement!.GetSelector();
var startSelector = start?.ParentElement?.GetSelector();
var endSelector = end?.ParentElement?.GetSelector();
if (startSelector == null || endSelector == null)
{
return new(-1, string.Empty);
}

var nodesBetween = new List<string>();
var startLocation = 0;

Expand Down Expand Up @@ -111,18 +127,58 @@ private static Tuple<int, string> GetContext(List<INode> allNodes, INode start,
return new(startLocation, string.Join(Environment.NewLine, nodesBetween));
}

/// <summary>
/// Resolves the table-of-contents section an annotation belongs to.
/// </summary>
/// <param name="epub">Book whose reading order supplies the content file path of the fragment.</param>
/// <param name="sections">
/// Known sections. The anchor lookup expects keys shaped like
/// <c>{content file path}-{anchor}-{title}</c>; the title lookup matches any key containing the title.
/// </param>
/// <param name="docFragmentId">Index into <c>epub.ReadingOrder</c> of the fragment holding the annotation.</param>
/// <param name="title">Chapter title recorded with the annotation. A null value is treated as empty.</param>
/// <param name="startPath">Node where the annotation starts, or null when it could not be located.</param>
/// <returns>The matching section, or null when none of the strategies finds one.</returns>
private static DocumentSection GetSectionForAnnotation(EpubBook epub, Dictionary<string, DocumentSection> sections, int docFragmentId, string title, INode startPath)
{
    // Normalize once: the anchor key already treated a null title as empty, but the
    // title lookup passed it straight to string.Contains, which throws on null.
    title ??= string.Empty;

    // Strategy 1: locate the section by title.
    // For older epub documents which don't have well formatted toc navigation.
    if (startPath == null)
    {
        return FindByTitle();
    }

    // Strategy 2: locate the section by anchor from nav element.
    // For epub 3 etc. with well formatted nav. Any ancestor of the start node may carry
    // the id that the navigation entry links to, so every ancestor is tried in turn.
    var filePath = epub.ReadingOrder[docFragmentId].FilePath;
    foreach (var ancestor in startPath.GetAncestors())
    {
        var sectionAnchor = (ancestor as IElement)?.Id ?? string.Empty;
        if (sections.TryGetValue($"{filePath}-{sectionAnchor}-{title}", out var section))
        {
            return section;
        }
    }

    // Strategy 3: fallback to title match. Skipped for an empty title because every
    // key contains the empty string and the match would be arbitrary.
    return title.Length > 0 ? FindByTitle() : null!;

    // First section whose key contains the title; null when there is none.
    DocumentSection FindByTitle() => sections.FirstOrDefault(s => s.Key.Contains(title)).Value;
}

/// <summary>
/// Depth-first walk of the epub navigation tree that registers a section for every
/// navigation item that is not of type HEADER.
/// </summary>
/// <param name="root">Navigation item to visit; its nested items are visited recursively.</param>
/// <param name="result">Receives the created sections, keyed by content file path, anchor and title.</param>
/// <param name="parent">Section to record as the parent of the section created for <paramref name="root"/>.</param>
/// <param name="level">Nesting depth assigned to the section created for <paramref name="root"/>.</param>
/// <param name="index">Running counter, incremented for every section created; sets the section location.</param>
private static void NavigationDfs(
    EpubNavigationItem root,
    Dictionary<string, DocumentSection> result,
    DocumentSection parent,
    int level,
    ref int index)
{
    // Skip adding HEADER type nodes as toc
    var rootSection = root.Type == EpubNavigationItemType.HEADER
        ? null
        : new DocumentSection(root.Title, level, ++index * 1000, parent);
    if (rootSection != null)
    {
        // The title alone may not be unique within a book, so the key combines the
        // content file path, the anchor and the title.
        // For example: text/foo.xhtml-chapter2-II, or text/foo.xhtml--II when the
        // link has no anchor.
        result.Add($"{root.Link?.ContentFilePath ?? string.Empty}-{root.Link?.Anchor ?? string.Empty}-{root.Title}", rootSection);
    }

    foreach (var nestedItem in root.NestedItems)
    {
        // Items nested under a skipped HEADER item receive a null parent.
        NavigationDfs(nestedItem, result, rootSection!, level + 1, ref index);
    }
}

Expand Down
18 changes: 18 additions & 0 deletions src/Noted/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ namespace Noted
{
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Threading.Tasks;
using Noted.Core;
using Noted.Core.Extensions;
Expand Down Expand Up @@ -39,11 +40,28 @@ public static async Task<int> Main(string[] args)
var workflows = new Dictionary<string, Func<Configuration, IWorkflow>>
{ { "extract", config => new ExtractWorkflow(config.FileSystem, config.Logger) } };

WaitForDebuggerIfEnabled();

return await new ConsoleInterface()
.WithArguments(args)
.WithConfigurationProvider(configurationProvider)
.WithWorkflows(workflows)
.RunAsync();
}

/// <summary>
/// Blocks startup until a debugger attaches, when the <c>NOTED_DEBUG</c> environment
/// variable is set to a non-empty value. Returns immediately otherwise.
/// </summary>
private static void WaitForDebuggerIfEnabled()
{
    var debugEnv = Environment.GetEnvironmentVariable("NOTED_DEBUG");
    if (string.IsNullOrEmpty(debugEnv))
    {
        return;
    }

    System.Console.WriteLine($"Please attach debugger to pid: {Environment.ProcessId}");

    // Poll for the debugger instead of calling Debugger.Break() in a tight loop:
    // breaking with nothing attached spins a core and signals a breakpoint that
    // no debugger is there to handle.
    while (!Debugger.IsAttached)
    {
        System.Threading.Thread.Sleep(100);
    }

    // Stop once, now that a debugger is attached, so execution can be stepped from startup.
    Debugger.Break();
}
}
}
28 changes: 27 additions & 1 deletion test/Noted.Tests/Extensions/Readers/EpubReaderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public async Task EpubReaderShouldParseDocumentWithSections()
var document = await new EpubReader(this.logger).Read(stream, new Noted.Core.Extensions.ReaderOptions(), (_) => []);

Assert.AreEqual(0, document.Annotations.Count());
Assert.AreEqual(35, document.Sections.Count());
Assert.AreEqual(0, document.Sections.Count());
Assert.AreEqual("The Prophet", document.Title);
Assert.AreEqual("Khalil Gibran", document.Author);
}
Expand All @@ -57,5 +57,31 @@ public async Task EpubReaderShouldParseDocumentWithAnnotations()
Assert.AreEqual("On Giving", annotations[0].Context.DocumentSection.Title);
Assert.AreNotEqual(0, annotations[0].Context.DocumentSection.Location);
Assert.AreNotEqual(0, annotations[0].Context.Location);
Assert.AreEqual(34, document.Sections.Count());
}

[TestMethod]
public async Task EpubReaderShouldParseEpub3DocumentWithAnnotations()
{
    // Arrange: one external highlight whose section is identified by the title "XVII"
    // and whose location is a serialized epub xpath range.
    using var epubStream = AssetFactory.GetAsset(AssetFactory.GetKOReaderLibrary(), "dialogues_seneca.epub");
    var bookReference = new DocumentReference
    {
        Title = "Seneca’s dialogues and consolations, including “On Benefits,” examine living life through the lens of Stoic philosophy."
    };
    var highlightContext = new AnnotationContext
    {
        DocumentSection = new DocumentSection("XVII", 0, 0, null),
        SerializedLocation = "epubxpath:///body/DocFragment[7]/body/section/section[17]/p/text()[2].2941-/body/DocFragment[7]/body/section/section[17]/p/text()[2].3124"
    };
    var highlight = new Annotation(
        "We shall never lack causes of anxiety, either pleasurable or painful: our life will be pushed along from one business to another: leisure will always be wished for, and never enjoyed.",
        bookReference,
        AnnotationType.Highlight,
        highlightContext,
        new DateTime(2023, 12, 23));
    var reader = new EpubReader(this.logger);

    // Act
    var document = await reader.Read(epubStream, new Noted.Core.Extensions.ReaderOptions(), (_) => [highlight]);

    // Assert: the single highlight is returned with a resolved section and location.
    var parsed = document.Annotations.ToList();
    Assert.AreEqual(1, parsed.Count);

    var parsedContext = parsed[0].Context;
    Assert.AreEqual("XVII", parsedContext.DocumentSection.Title);
    Assert.AreNotEqual(0, parsedContext.DocumentSection.Location);
    Assert.AreNotEqual(0, parsedContext.Location);
    Assert.AreEqual(552, document.Sections.Count());
}
}
Binary file added test/assets/koreader/dialogues_seneca.epub
Binary file not shown.

0 comments on commit e97cbee

Please sign in to comment.