From bec390ef355270f7de43cf22ed21e746651b9cad Mon Sep 17 00:00:00 2001
From: Romain Deltour <rdeltour@gmail.com>
Date: Wed, 16 Nov 2022 10:31:10 +0100
Subject: [PATCH] feat: better parse URL fragment micro syntaxes

This commit introduces a new `URLFragment` class to represent URL fragments.

Fragment strings are parsed into `URLFragment` instances using MIME type-specific logic, implementing some validity checks for a few micro syntaxes
including:

- shorthand bare name IDs
- scheme-based fragments
- media fragments

SVG and HTML/XHTML MIME types are supported.

The parser is tested in the `url-fragment.feature` feature file, in a new `unit-tests` directory.
---
 .../com/adobe/epubcheck/opf/OPFChecker.java   |   3 +
 .../com/adobe/epubcheck/opf/OPFChecker30.java |   5 +-
 .../com/adobe/epubcheck/opf/XRefChecker.java  |  69 ++-
 .../com/adobe/epubcheck/ops/OPSHandler.java   |   5 -
 .../org/w3c/epubcheck/constants/MIMEType.java |   2 +-
 .../org/w3c/epubcheck/url/URLFragment.java    | 410 ++++++++++++++++++
 .../w3c/epubcheck/url/URLFragmentSteps.java   |  67 +++
 .../EPUB/package.opf                          |   2 +-
 .../resources/unit-tests/url-fragment.feature | 105 +++++
 9 files changed, 636 insertions(+), 32 deletions(-)
 create mode 100644 src/main/java/org/w3c/epubcheck/url/URLFragment.java
 create mode 100644 src/test/java/org/w3c/epubcheck/url/URLFragmentSteps.java
 create mode 100644 src/test/resources/unit-tests/url-fragment.feature
diff --git a/src/main/java/com/adobe/epubcheck/opf/OPFChecker.java b/src/main/java/com/adobe/epubcheck/opf/OPFChecker.java
index 72a34f234..06a560bfb 100755
--- a/src/main/java/com/adobe/epubcheck/opf/OPFChecker.java
+++ b/src/main/java/com/adobe/epubcheck/opf/OPFChecker.java
@@ -116,6 +116,9 @@ protected boolean checkPackage()
 
     List<OPFItem> items = opfHandler.getItems();
     report.info(null, FeatureEnum.ITEMS_COUNT, Integer.toString(items.size()));
+    
+    // Register package doc and items with the XRefChecker
+    xrefChecker.registerResource(context.url, context.mimeType);
     for (OPFItem item : items)
     {
       xrefChecker.registerResource(item,
diff --git a/src/main/java/com/adobe/epubcheck/opf/OPFChecker30.java b/src/main/java/com/adobe/epubcheck/opf/OPFChecker30.java
index 08681930f..f249a1193 100644
--- a/src/main/java/com/adobe/epubcheck/opf/OPFChecker30.java
+++ b/src/main/java/com/adobe/epubcheck/opf/OPFChecker30.java
@@ -25,6 +25,8 @@
 import java.util.Iterator;
 import java.util.Set;
 
+import org.w3c.epubcheck.url.URLFragment;
+
 import com.adobe.epubcheck.api.EPUBLocation;
 import com.adobe.epubcheck.api.EPUBProfile;
 import com.adobe.epubcheck.api.FeatureReport.Feature;
@@ -387,7 +389,8 @@ private void checkPreviewCollection(ResourceCollection collection)
         }
         else
         {
-          if (Optional.fromNullable(resource.getURL().fragment()).or("").startsWith("epubcfi("))
+          URLFragment fragment = URLFragment.parse(resource.getURL());
+          if (fragment.exists() && "epubcfi".equals(fragment.getScheme()))
           {
             report.message(MessageId.OPF_076, EPUBLocation.of(context));
           }
diff --git a/src/main/java/com/adobe/epubcheck/opf/XRefChecker.java b/src/main/java/com/adobe/epubcheck/opf/XRefChecker.java
index 1820f402b..655478a2a 100755
--- a/src/main/java/com/adobe/epubcheck/opf/XRefChecker.java
+++ b/src/main/java/com/adobe/epubcheck/opf/XRefChecker.java
@@ -31,8 +31,9 @@
 import java.util.Map;
 import java.util.Queue;
 import java.util.Set;
-import java.util.regex.Pattern;
 
+import org.w3c.epubcheck.constants.MIMEType;
+import org.w3c.epubcheck.url.URLFragment;
 import org.w3c.epubcheck.url.URLUtils;
 
 import com.adobe.epubcheck.api.EPUBLocation;
@@ -126,6 +127,7 @@ public static final class Builder
       private OPFItem item = null;
       private boolean hasItemFallback = false;
       private boolean hasImageFallback = false;
+      public String mimetype;
 
       public Builder url(URL url)
       {
@@ -137,6 +139,13 @@ public Builder item(OPFItem item)
       {
         this.url = item.getURL();
         this.item = item;
+        this.mimetype = item.getMimeType();
+        return this;
+      }
+
+      public Builder mimetype(String mimetype)
+      {
+        this.mimetype = mimetype;
         return this;
       }
 
@@ -231,8 +240,6 @@ public boolean isInSpine()
     }
   }
 
-  private static final Pattern REGEX_SVG_VIEW = Pattern.compile("svgView\\(.*\\)");
-
   private final Map<URL, Resource> resources = new HashMap<URL, Resource>();
 
   private final Set<URL> undeclared = new HashSet<URL>();
@@ -281,7 +288,7 @@ public Optional<OPFItem> getResource(URL url)
    * @param path
    *          the path to a publication resource
    * @return an immutable {@link EnumSet} containing the types of references to
-   *         {@code path}.
+   *           {@code path}.
    */
   public Set<Type> getTypes(URL resource)
   {
@@ -413,9 +420,15 @@ public void checkReferences()
   private void checkReference(URLReference reference)
   {
     Resource hostResource = resources.get(reference.location.url);
-    Resource targetResource = resources.get(reference.targetDoc);
+
+    // Retrieve the Resource instance representing the targeted document
     // If the resource was not declared in the manifest,
     // we build a new Resource object for the data URL.
+    Resource targetResource = resources.get(reference.targetDoc);
+    String targetMimetype = (targetResource != null) ? targetResource.getMimeType() : "";
+
+    // Parse the URL fragment
+    URLFragment fragment = URLFragment.parse(reference.url, targetMimetype);
 
     // Check remote resources
     if (container.isRemote(reference.url)
@@ -470,15 +483,18 @@ else if (!undeclared.contains(reference.targetDoc)
       return;
     }
 
-    String mimetype = targetResource.getMimeType();
-
     // Type-specific checks
     switch (reference.type)
     {
     case HYPERLINK:
+      if ("epubcfi".equals(fragment.getScheme()))
+      {
+        break; // EPUB CFI is not supported
+      }
       // if mimeType is null, we should have reported an error already
-      if (!OPFChecker.isBlessedItemType(mimetype, version)
-          && !OPFChecker.isDeprecatedBlessedItemType(mimetype) && !targetResource.hasItemFallback())
+      if (!OPFChecker.isBlessedItemType(targetMimetype, version)
+          && !OPFChecker.isDeprecatedBlessedItemType(targetMimetype)
+          && !targetResource.hasItemFallback())
       {
         report.message(MessageId.RSC_010,
             reference.location.context(container.relativize(reference.url)));
@@ -494,31 +510,35 @@ else if (!undeclared.contains(reference.targetDoc)
     case IMAGE:
     case PICTURE_SOURCE:
     case PICTURE_SOURCE_FOREIGN:
-      if (reference.url.fragment() != null && !mimetype.equals("image/svg+xml"))
+      if ("epubcfi".equals(fragment.getScheme()))
+      {
+        break; // EPUB CFI is not supported
+      }
+      if (fragment.exists() && !MIMEType.SVG.is(targetMimetype))
       {
         report.message(MessageId.RSC_009,
             reference.location.context(container.relativize(reference.url)));
         return;
       }
       // if mimeType is null, we should have reported an error already
-      if (!OPFChecker.isBlessedImageType(mimetype, version))
+      if (!OPFChecker.isBlessedImageType(targetMimetype, version))
       {
         if (version == EPUBVersion.VERSION_3 && reference.type == Type.PICTURE_SOURCE)
         {
           report.message(MessageId.MED_007, reference.location,
-              container.relativize(reference.targetDoc), mimetype);
+              container.relativize(reference.targetDoc), targetMimetype);
           return;
         }
         else if (reference.type == Type.IMAGE && !targetResource.hasImageFallback())
         {
           report.message(MessageId.MED_003, reference.location,
-              container.relativize(reference.targetDoc), mimetype);
+              container.relativize(reference.targetDoc), targetMimetype);
         }
       }
       break;
     case SEARCH_KEY:
       // TODO update when we support EPUB CFI
-      if ((reference.url.fragment() == null || !reference.url.fragment().startsWith("epubcfi("))
+      if ((!fragment.exists() || !"epubcfi".equals(fragment.getScheme()))
           && !targetResource.isInSpine())
       {
         report.message(MessageId.RSC_021, reference.location,
@@ -527,7 +547,7 @@ else if (reference.type == Type.IMAGE && !targetResource.hasImageFallback())
       }
       break;
     case STYLESHEET:
-      if (reference.url.fragment() != null)
+      if (fragment.exists())
       {
         report.message(MessageId.RSC_013,
             reference.location.context(container.relativize(reference.url)));
@@ -551,7 +571,7 @@ else if (reference.type == Type.IMAGE && !targetResource.hasImageFallback())
     case SVG_CLIP_PATH:
     case SVG_PAINT:
     case SVG_SYMBOL:
-      if (reference.url.fragment() == null)
+      if (!fragment.exists())
       {
         report.message(MessageId.RSC_015, reference.location.context(reference.url));
         return;
@@ -562,32 +582,32 @@ else if (reference.type == Type.IMAGE && !targetResource.hasImageFallback())
     }
 
     // Fragment integrity checks
-    String fragment = reference.url.fragment();
-    if (fragment != null && !fragment.isEmpty())
+    if (fragment.exists() && !fragment.isEmpty())
     {
       // EPUB CFI
-      if (fragment.startsWith("epubcfi("))
+      if ("epubcfi".equals(fragment.getScheme()))
       {
+        // FIXME HOT should warn if in MO
         // FIXME epubcfi currently not supported (see issue 150).
         return;
       }
       // Media fragments in Data Navigation Documents
-      else if (fragment.contains("=") && hostResource != null && hostResource.hasItem()
+      else if (fragment.isMediaFragment() && hostResource != null && hostResource.hasItem()
           && hostResource.getItem().getProperties()
               .contains(PackageVocabs.ITEM_VOCAB.get(PackageVocabs.ITEM_PROPERTIES.DATA_NAV)))
       {
         // Ignore,
         return;
       }
-      // SVG view fragments are ignored
-      else if (mimetype.equals("image/svg+xml") && REGEX_SVG_VIEW.matcher(fragment).matches())
+      // Non-ID-based fragments are ignored
+      else if (fragment.getId().isEmpty())
       {
         return;
       }
       // Fragment Identifier (by default)
       else if (!container.isRemote(reference.targetDoc))
       {
-        ID anchor = targetResource.ids.get(fragment);
+        ID anchor = targetResource.ids.get(fragment.getId());
         if (anchor == null)
         {
           report.message(MessageId.RSC_012, reference.location.context(reference.url.toString()));
@@ -674,7 +694,8 @@ private void checkReadingOrder(Queue<URLReference> references, int lastSpinePosi
       }
 
       // check that the fragment is in document order
-      int targetAnchorPosition = res.getIDPosition(ref.url.fragment());
+      URLFragment fragment = URLFragment.parse(ref.url, res.getMimeType());
+      int targetAnchorPosition = res.getIDPosition(fragment.getId());
       if (targetAnchorPosition < lastAnchorPosition)
       {
         String orderContext = LocalizedMessages.getInstance(locale).getSuggestion(MessageId.NAV_011,
diff --git a/src/main/java/com/adobe/epubcheck/ops/OPSHandler.java b/src/main/java/com/adobe/epubcheck/ops/OPSHandler.java
index 7ce928429..ad3993eaa 100755
--- a/src/main/java/com/adobe/epubcheck/ops/OPSHandler.java
+++ b/src/main/java/com/adobe/epubcheck/ops/OPSHandler.java
@@ -141,11 +141,6 @@ else if (".".equals(href))
 
     // If the URL was not properly parsed, return early
     if (url == null) return;
-    // If the URL is an EPUB CFI, return (not implemented)
-    if (url.fragment() != null && url.fragment().matches("epubcfi\\(.*\\)"))
-    {
-      return; // temp until cfi implemented
-    }
 
     if ("file".equals(url.scheme()))
     {
diff --git a/src/main/java/org/w3c/epubcheck/constants/MIMEType.java b/src/main/java/org/w3c/epubcheck/constants/MIMEType.java
index 29dc92e2b..ced86f640 100644
--- a/src/main/java/org/w3c/epubcheck/constants/MIMEType.java
+++ b/src/main/java/org/w3c/epubcheck/constants/MIMEType.java
@@ -54,6 +54,6 @@ public boolean is(String string)
 
   public static MIMEType get(String name)
   {
-    return ENUM_MAP.getOrDefault(name.toLowerCase(Locale.ROOT), OTHER);
+    return (name != null) ? ENUM_MAP.getOrDefault(name.toLowerCase(Locale.ROOT), OTHER) : OTHER;
   }
 }
diff --git a/src/main/java/org/w3c/epubcheck/url/URLFragment.java b/src/main/java/org/w3c/epubcheck/url/URLFragment.java
new file mode 100644
index 000000000..6baf498ee
--- /dev/null
+++ b/src/main/java/org/w3c/epubcheck/url/URLFragment.java
@@ -0,0 +1,410 @@
+package org.w3c.epubcheck.url;
+
+import java.util.Iterator;
+import java.util.Objects;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.w3c.epubcheck.constants.MIMEType;
+
+import com.google.common.base.Splitter;
+import com.google.common.base.Strings;
+
+import io.mola.galimatias.URL;
+import io.mola.galimatias.URLUtils;
+import net.sf.saxon.om.NameChecker;
+
+/**
+ * Represents a URL fragment, after parsing micro-syntaxes.
+ */
+public class URLFragment
+{
+
+  /**
+   * Represents a non-existent fragment, for which {@link #exists()} returns
+   * <code>false</code>
+   */
+  public static final URLFragment NONE = new URLFragment(new Parser().parse(null, null));
+
+  private final String fragment;
+  private final String scheme;
+  private final String id;
+  private final boolean isMediaFragment;
+  private final boolean isValid;
+
+  private URLFragment(Parser parser)
+  {
+    this.fragment = parser.fragment;
+    this.id = Strings.nullToEmpty(parser.id);
+    this.scheme = Strings.nullToEmpty(parser.scheme);
+    this.isMediaFragment = parser.isMediaFragment;
+    this.isValid = parser.isValid;
+  }
+
+  /**
+   * Returns the element ID represented by this fragment if this is an ID-based
+   * fragment, or the empty string otherwise.
+   * 
+   * @return an element ID or the empty string.
+   */
+  public String getId()
+  {
+    return id;
+  }
+
+  /**
+   * Returns the scheme represented by this fragment if this is a scheme-based
+   * fragment, or the empty string otherwise.
+   * 
+   * @return a scheme name or the empty string.
+   */
+  public String getScheme()
+  {
+    return scheme;
+  }
+
+  /**
+   * @return <code>true</code> iff the URL from which this was parsed had a
+   *           fragment.
+   */
+  public boolean exists()
+  {
+    return fragment != null;
+  }
+
+  /**
+   * @return <code>true</code> iff this fragment is the empty string or
+   *           represents a non-existent fragment.
+   */
+  public boolean isEmpty()
+  {
+    return fragment == null || fragment.isEmpty();
+  }
+
+  /**
+   * @return <code>true</code> iff this fragment is valid according to its
+   *           target MIME type.
+   */
+  public boolean isValid()
+  {
+    return isValid;
+  }
+
+  /**
+   * @return <code>true</code> iff this fragment is a media fragment.
+   */
+  public boolean isMediaFragment()
+  {
+    return isMediaFragment;
+  }
+
+  @Override
+  /**
+   * @return the full fragment string.
+   */
+  public String toString()
+  {
+    return fragment;
+  }
+
+  @Override
+  public int hashCode()
+  {
+    return Objects.hash(fragment);
+  }
+
+  @Override
+  public boolean equals(Object obj)
+  {
+    if (this == obj) return true;
+    if (obj == null) return false;
+    if (getClass() != obj.getClass()) return false;
+    URLFragment other = (URLFragment) obj;
+    return Objects.equals(fragment, other.fragment);
+  }
+
+  /**
+   * Parse the fragment of the given URL, according to the rules defined for the
+   * given MIME type.
+   * 
+   * If the URL has no fragment, returns {@link #NONE}
+   * 
+   * <h2>HTML types "application/xhtml+xml" and "text/html"</h2>
+   *
+   * <p>
+   * The following fragment patterns are supported:
+   * </p>
+   * 
+   * <ul>
+   * <li>regular ID-based fragments (`#name`)</li>
+   * <li>scheme-based fragments (`#name(something)`)</li>
+   * <li>media fragments (`#name=value`, with name one of
+   * `t|xywh|track|id|xyn|xyr`)</li>
+   * <li>fragment directives (`#name:~:text=range`)</li>
+   * </ul>
+   * 
+   * <p>
+   * Note that this deviates from the HTML standard in the following way:
+   * </p>
+   * 
+   * <ul>
+   * <li>HTML does not define specific logic for scheme-based or media
+   * fragments, which must be treated like any other IDs. However, EPUB makes
+   * use of them notably for EPUB CFI or region-based navigation.</li>
+   * <li>Fragment directives (as used in text fragments) are an incubating
+   * standard (at the time of writing) and are likely not well supported by
+   * reading systems, but their syntax is specific enough to lower the risk
+   * of false positives.</li>
+   * </ul>
+   * 
+   * <h2>SVG type "image/svg+xml"</h2>
+   *
+   * <p>
+   * The following fragment patterns are supported:
+   * </p>
+   * 
+   * <ul>
+   * <li>shorthand bare form names (<code>#name</code>). Validation checks that
+   * the name is an XML NCName.</li>
+   * <li>SVG view specification (<code>#svgView(…)</code>). Validation currently
+   * does not look into the parenthesis content.</li>
+   * <li>basic media fragments (<code>#xywh=0,0,50,50</code>). Validation checks
+   * the syntax of spatial and temporal dimensions.</li>
+   * </ul>
+   * 
+   * <h2>Other type</h2>
+   * 
+   * <p>
+   * Any other type is assumed to be XML. The following fragment patterns are
+   * supported:
+   * </p>
+   * 
+   * <ul>
+   * <li>shorthand bare form names (<code>#name</code>). Validation checks that
+   * the name is an XML NCName.</li>
+   * <li>scheme-based fragments (`#name(something)`). No validation of the
+   * scheme name or syntax.</li>
+   * </ul>
+   * 
+   * @param url
+   *          a URL
+   * @param mimetype
+   *          the MIME type of the URL target
+   * @return a parsed fragment (cannot be <code>null</code>)
+   */
+  public static URLFragment parse(URL url, String mimetype)
+  {
+    if (url == null || url.fragment() == null)
+    {
+      return NONE;
+    }
+    else
+    {
+      return new URLFragment(new Parser().parse(url.fragment(), mimetype));
+    }
+  }
+
+  /**
+   * Parse the fragment of the given URL, according to the default rules (XML
+   * MIME type), see {@link URLFragment#parse(URL, String)}.
+   * 
+   * @param url
+   *          a URL
+   * @return a parsed fragment (cannot be <code>null</code>)
+   */
+  public static URLFragment parse(URL url)
+  {
+    return parse(url, "");
+  }
+
+  private static final class Parser
+  {
+    private String fragment;
+    private String scheme;
+    private String id;
+    private boolean isMediaFragment = false;
+    private boolean isValid = true;
+
+    /*
+     * Parse the fragment, by dispatching to a type-specific method.
+     * 
+     * Note (2022): parsing would likely be more efficient if implemented as a
+     * state parser instead of using regex-based string matching.
+     */
+    private Parser parse(String fragment, String mimetype)
+    {
+      this.fragment = fragment;
+      if (fragment != null)
+      {
+        switch (MIMEType.get(mimetype))
+        {
+        case SVG:
+          parseSVGFragment(fragment);
+          break;
+        case HTML:
+        case XHTML:
+          parseHTMLFragment(fragment);
+          break;
+        default:
+          parseXMLFragment(fragment);
+          break;
+        }
+      }
+      return this;
+    }
+
+    private static final Pattern SCHEME_BASED = Pattern.compile("(\\w+)\\(.*\\)");
+    private static final Pattern MEDIA_FRAGMENT = Pattern
+        .compile("(t|xywh|track|id|xyn|xyr)=[^&]+(&[^&=]+=[^&]+)*");
+
+    // Parses an XML fragment identifier
+    private void parseXMLFragment(String fragment)
+    {
+      Matcher matcher;
+      // Scheme-based
+      if ((matcher = SCHEME_BASED.matcher(fragment)).matches())
+      {
+        this.scheme = matcher.group(1);
+      }
+      // ID fragment
+      else
+      {
+        this.id = URLUtils.percentDecode(fragment);
+        this.isValid = NameChecker.isValidNCName(id);
+      }
+    }
+
+    /*
+     * Parses an HTML fragment identifier
+     */
+    private void parseHTMLFragment(String fragment)
+    {
+      Matcher matcher;
+      // strip fragment directive
+      // see https://wicg.github.io/scroll-to-text-fragment/
+      int index;
+      if ((index = fragment.indexOf(":~:")) > -1)
+      {
+        fragment = fragment.substring(0, index);
+      }
+      // scheme-based fragment
+      if ((matcher = SCHEME_BASED.matcher(fragment)).matches())
+      {
+        this.scheme = matcher.group(1);
+      }
+      // media fragment
+      else if ((matcher = MEDIA_FRAGMENT.matcher(fragment)).matches())
+      {
+        this.isMediaFragment = true;
+      }
+      // ID fragment
+      else
+      {
+        this.id = URLUtils.percentDecode(fragment);
+      }
+    }
+
+    /*
+     * Parses an SVG fragment identifier, see:
+     * https://www.w3.org/TR/SVG/linking.html#SVGFragmentIdentifiersDefinitions
+     */
+    private void parseSVGFragment(String fragment)
+    {
+
+      if (fragment.isEmpty()) return;
+
+      // Split the fragment into &-separated components
+      Iterator<String> components = Splitter.on('&').split(fragment).iterator();
+      String first = components.next();
+
+      // SVG view specification
+      if (first.startsWith("svgView("))
+      {
+        // check the SVG view is well-formed
+        isValid = parseSVGView(first);
+        // check optional remaining components are well-formed time segments
+        while (isValid && components.hasNext())
+        {
+          isValid = parseTimeSegment(components.next());
+        }
+      }
+      // Temporal media fragment
+      else if (first.startsWith("t="))
+      {
+        isMediaFragment = true;
+        // check the first component is a well-formed time segment
+        isValid = parseTimeSegment(first);
+        // check optional remaining components are well-formed space segments
+        while (isValid && components.hasNext())
+        {
+          isValid = parseSpaceSegment(components.next());
+        }
+      }
+      // Spatial media fragment
+      else if (first.startsWith("xywh="))
+      {
+        isMediaFragment = true;
+        // check the first component is a well-formed space segment
+        isValid = parseSpaceSegment(first);
+        // check optional remaining components are well-formed time segments
+        while (isValid && components.hasNext())
+        {
+          isValid = parseTimeSegment(components.next());
+        }
+      }
+      else if (first.contains("="))
+      {
+        isValid = false;
+      }
+      // Shorthand bare name
+      else
+      {
+        // Record the ID, percent-decoded
+        this.id = URLUtils.percentDecode(first);
+        // check validity of the ID
+        this.isValid = NameChecker.isValidNCName(id);
+        // check optional remaining components are well-formed time segments
+        while (isValid && components.hasNext())
+        {
+          isValid = parseTimeSegment(components.next());
+        }
+      }
+    }
+
+    private static final Pattern SVGVIEW = Pattern.compile("svgView\\(.+\\)");
+
+    private boolean parseSVGView(String string)
+    {
+      return isValid = SVGVIEW.matcher(string).matches();
+    }
+
+    private static final Pattern SPATIAL = Pattern
+        .compile("xywh=(pixel:|percent:)?\\d+,\\d+,\\d+,\\d+");
+
+    private boolean parseSpaceSegment(String string)
+    {
+      return isValid = SPATIAL.matcher(string).matches();
+    }
+
+    private static final Pattern TEMPORAL = Pattern
+        .compile("t=(?:npt:)?(?:([0-9.:]+)(?:,([0-9.:]+))?|,([0-9.:]+))");
+    private static final Pattern NPTTIME = Pattern
+        .compile("((\\d+)|([0-5]\\d:[0-5]\\d)|(\\d+:[0-5]\\d:[0-5]\\d))(\\.\\d*)?");
+
+    private boolean parseTimeSegment(String string)
+    {
+      Matcher matcher = TEMPORAL.matcher(string);
+      if (isValid = matcher.matches())
+      {
+        int i = 1;
+        while (isValid && i <= matcher.groupCount())
+        {
+          isValid = matcher.group(i) == null || NPTTIME.matcher(matcher.group(i)).matches();
+          i++;
+        }
+      }
+      return isValid;
+    }
+  }
+
+}
diff --git a/src/test/java/org/w3c/epubcheck/url/URLFragmentSteps.java b/src/test/java/org/w3c/epubcheck/url/URLFragmentSteps.java
new file mode 100644
index 000000000..4f62a0b74
--- /dev/null
+++ b/src/test/java/org/w3c/epubcheck/url/URLFragmentSteps.java
@@ -0,0 +1,67 @@
+package org.w3c.epubcheck.url;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.emptyString;
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertTrue;
+
+import java.net.URI;
+
+import org.w3c.epubcheck.constants.MIMEType;
+
+import com.google.common.base.Enums;
+
+import io.cucumber.java.en.Then;
+import io.mola.galimatias.GalimatiasParseException;
+import io.mola.galimatias.URL;
+
+public class URLFragmentSteps
+{
+
+  private static final URL BASE_URL = URL.fromJavaURI(URI.create("https://example.org"));
+
+  private URLFragment result;
+
+  @Then("{string} is a {} {} fragment")
+  public void testSVGFragment(String fragment, String validity, String type)
+  {
+    result = parse(fragment,
+        Enums.getIfPresent(MIMEType.class, type).or(MIMEType.OTHER).toString());
+    assertThat((result.isValid()) ? "valid" : "invalid", is(validity));
+  }
+
+  @Then("it indicates an element with ID {string}")
+  public void assertID(String id)
+  {
+    assertThat(result.getId(), is(id));
+  }
+
+  @Then("it does not indicate an element")
+  public void assertIDIsEmpty()
+  {
+    assertThat(result.getId(), is(emptyString()));
+  }
+
+  @Then("it is a media fragment")
+  public void assertMediaFragment()
+  {
+    assertTrue(result.isMediaFragment());
+  }
+
+  @Then("it has scheme {string}")
+  public void assertScheme(String scheme)
+  {
+    assertThat(result.getScheme(), is(scheme));
+  }
+
+  private URLFragment parse(String fragment, String mimetype)
+  {
+    try
+    {
+      return URLFragment.parse(BASE_URL.withFragment(fragment), mimetype);
+    } catch (GalimatiasParseException e)
+    {
+      throw new AssertionError("Could not create URL with fragment " + fragment, e);
+    }
+  }
+}
diff --git a/src/test/resources/epub-previews/files/epub/preview-embedded-link-cfi-error/EPUB/package.opf b/src/test/resources/epub-previews/files/epub/preview-embedded-link-cfi-error/EPUB/package.opf
index 07be897c3..084ecc944 100644
--- a/src/test/resources/epub-previews/files/epub/preview-embedded-link-cfi-error/EPUB/package.opf
+++ b/src/test/resources/epub-previews/files/epub/preview-embedded-link-cfi-error/EPUB/package.opf
@@ -24,6 +24,6 @@
     <link href="content_002.xhtml" media-type="application/xhtml+xml"/>
     <link href="style.css" media-type="text/css"/>
   </collection>
-  <link href="content_001.xhtml#epubcfi(/6/2!/4/2/1:3"/>
+  <link href="content_001.xhtml#epubcfi(/6/2!/4/2/1:3)"/>
 </collection>
 </package>
\ No newline at end of file
diff --git a/src/test/resources/unit-tests/url-fragment.feature b/src/test/resources/unit-tests/url-fragment.feature
new file mode 100644
index 000000000..aba0e8712
--- /dev/null
+++ b/src/test/resources/unit-tests/url-fragment.feature
@@ -0,0 +1,105 @@
+Feature: URL fragment parser
+  
+  Tests the parser for URL fragments
+    
+  Scenario Outline: HTML ID-based fragment <fragment>
+		* <fragment> is a valid HTML fragment
+		And it indicates an element with ID <id>
+
+    Scenarios:
+      | fragment        | id   |
+      | "id"            | "id" |
+      | "%40%40"        | "@@" |
+      | "id:~:text=a,b" | "id" |
+
+    Scenarios: Text fragments (experimental)
+      | fragment         |  id  |
+      | "id:~:text=a,b"  | "id" |
+      | ":~:text=a,b"    |  ""  |
+
+    Scenarios: "invalid" non-ID-based fragments are processed as IDs
+      | fragment   |  id         |
+      | "foo=bar"  |  "foo=bar"  |
+      | "epubcfi(" |  "epubcfi(" |
+
+
+  Scenario Outline: HTML scheme-based fragment <fragment>
+		* <fragment> is a valid HTML fragment
+		And it has scheme <scheme>
+		And it does not indicate an element
+
+    Scenarios:
+      | fragment            |   scheme   |
+      | "xpointer(id(foo))" | "xpointer" |
+      | "epubcfi(/6/4[chap01ref]!/4[body01])" |  "epubcfi"  |
+
+  Scenario Outline: HTML media fragment <fragment>
+		* <fragment> is a valid HTML fragment
+		And it is a media fragment
+		And it does not indicate an element
+
+    Scenarios:
+      | fragment       |
+      | "xywh=1,1,1,1" |
+      | "t=10"         |
+      | "track=audio"  |
+      | "id=foo"       |
+
+  Scenario Outline: SVG shorthand fragment <fragment>
+		* <fragment> is a <validity> SVG fragment
+		And it indicates an element with ID <id>
+
+    Scenarios: Shorthand fragments
+      
+      | fragment         | validity |  id   |
+      | "id"             |  valid   | "id"  |
+      | "id&t=10"        |  valid   | "id"  |
+      | "id&t=10&t=5"    |  valid   | "id"  |
+      | "id&foo=bar"     | invalid  | "id"  |
+      | "id&t="          | invalid  | "id"  |
+      | "id&"            | invalid  | "id"  |
+      | "*id"            | invalid  | "*id" | (not an NCName)
+      | "%40%40"         | invalid  | "@@"  | (not an NCName)
+    
+    
+  Scenario Outline: SVG media fragment <fragment>
+		* <fragment> is a <validity> SVG fragment
+		
+    Scenarios: Temporal media fragment
+      | fragment             | validity |
+      | "t=npt:10,20"        |  valid   |
+      | "t=npt:,121.5"       |  valid   |
+      | "t=0:02:00,121.5"    |  valid   |
+      | "t=npt:120,0:02:01." |  valid   |
+      | "t=60:00"            | invalid  |
+      | "t=00:99"            | invalid  |
+      | "t=123:00:00"        |  valid   |
+      | "t=10&xywh=0,0,1,1"  |  valid   |
+		
+    Scenarios: Spatial media fragment
+      | fragment                     | validity |
+      | "xywh=160,120,320,240"       |  valid   |
+      | "xywh=pixel:160,120,320,240" |  valid   |
+      | "xywh=percent:25,25,50,50"   |  valid   |
+      | "xywh=160,120,320"           | invalid  |
+      | "xywh=px:160,120,320,240"    | invalid  |
+		
+    Scenarios: SVG view specification
+      | fragment                                 | validity |
+      | "svgView(viewBox(0,0,200,200))"          |  valid   |
+      | "svgView(preserveAspectRatio(xMidYMid))" |  valid   |
+      | "svgView(transform(scale(5))"            |  valid   |
+      | "svgView()"                              | invalid  |
+      | "svgView(viewBox(0,0,200,200"            | invalid  |
+    
+  Scenario Outline: SVG invalid fragments <fragment>
+    Should not be parsed as legit IDs
+		* <fragment> is a <validity> SVG fragment
+		And it indicates an element with ID <id>
+
+    Scenarios: Unknown or invalid media fragments
+      | fragment  | validity | id  |
+      | "foo=bar" | invalid  | ""  |
+      | "foo="    | invalid  | ""  |
+      | "=foo"    | invalid  | ""  |
+