contenthash: implement proper Linux symlink semantics for getFollowLinks

This patch is part of a series which fixes the symlink resolution semantics within BuildKit. You cannot implement symlink resolution on Linux naively using path.Join. A correct implementation requires tracking the current path and applying each new component individually. This implementation is loosely based on github.com/cyphar/filepath-securejoin. Things to note: * The previous implementation of getFollowLinks actually only resolved symlinks in parent components of the path, leading to some callers to have to implement resolution manually (and incorrectly) by calling getFollowLinks several times. In addition to being incorrect and somewhat difficult to follow, it also lead to the ELOOP limit being much higher than 255 (while some callers used getFollowLinksWalk, most used getFollowLinks which reset the limit for each iteration). So, add getFollowParentLinks to allow for callers to decide which behaviour they need. getFollowLinks now follows all links (correctly). * The trailing-slash-is-significant behaviour in the cache (dir vs dir header) needs to be handled specially because on Linux there is no distinction between "a/" and "a" (assuming a is not a symlink, that is) and so filepath-securejoin's implementation didn't care about trailing slashes. The previous implementation hid the trailing path behaviour purely in the splitKey() implementation, making the need for this quite subtle. * The previous implementation was recursive, which in theory would allow you to find some paths slightly more quickly (if you find a valid ancestor you don't need to check above it) at the cost of making some lookups more expensive (a path with an invalid ancestor very early on in the path). However, implementing the correct lookup algorithm recursively proved to be quite difficult. It is possible to implement a similar optimisation (try to find the first non-symlink parent component and iterate from there), this complicates the implementation a fair amount and it doesn't seem clear that the performance tradeoff is a benefit in general. Ultimately, cache lookups are quite fast and so there probably isn't a practical performance difference between approaches. Signed-off-by: Aleksa Sarai <[email protected]>
moby · May 2, 2024 · 4fa7ea6 · 4fa7ea6
1 parent bcacb3e
commit 4fa7ea6
Showing 1 changed file with 88 additions and 56 deletions.
diff --git a/cache/contenthash/checksum.go b/cache/contenthash/checksum.go
@@ -542,7 +542,7 @@ func (cc *cacheContext) includedPaths(ctx context.Context, m *mount, p string, o
  // involves a symlink. That will match fsutil behavior of
  // calling functions such as stat and walk.
  var cr *CacheRecord
- k, cr, err = getFollowLinks(root, k, true)
+ k, cr, err = getFollowParentLinks(root, k, true)
  if err != nil {
  return nil, err
  }
@@ -751,36 +751,16 @@ func wildcardPrefix(root *iradix.Node, p string) (string, []byte, bool, error) {
  return "", nil, false, nil
  }
 
- linksWalked := 0
- k, cr, err := getFollowLinksWalk(root, convertPathToKey([]byte(d1)), true, &linksWalked)
+ // Only resolve the final symlink component if there are components in the
+ // wildcard segment.
+ resolveFn := getFollowParentLinks
+ if d2 != "" {
+ resolveFn = getFollowLinks
+ }
+ k, cr, err := resolveFn(root, convertPathToKey([]byte(d1)), true)
  if err != nil {
  return "", k, false, err
  }
-
- if d2 != "" && cr != nil && cr.Type == CacheRecordTypeSymlink {
- // getFollowLinks only handles symlinks in path
- // components before the last component, so
- // handle last component in d1 specially.
- resolved := string(convertKeyToPath(k))
- for {
- v, ok := root.Get(k)
-
- if !ok {
- return d1, k, false, nil
- }
- if v.(*CacheRecord).Type != CacheRecordTypeSymlink {
- break
- }
-
- linksWalked++
- if linksWalked > 255 {
- return "", k, false, errors.Errorf("too many links")
- }
-
- resolved := cleanLink(resolved, v.(*CacheRecord).Linkname)
- k = convertPathToKey([]byte(resolved))
- }
- }
  return d1, k, cr != nil, nil
 }
 
@@ -892,7 +872,7 @@ func (cc *cacheContext) lazyChecksum(ctx context.Context, m *mount, p string) (*
 
 func (cc *cacheContext) checksum(ctx context.Context, root *iradix.Node, txn *iradix.Txn, m *mount, k []byte, follow bool) (*CacheRecord, bool, error) {
  origk := k
- k, cr, err := getFollowLinks(root, k, follow)
+ k, cr, err := getFollowParentLinks(root, k, follow)
  if err != nil {
  return nil, false, err
  }
@@ -1071,12 +1051,10 @@ func (cc *cacheContext) scanPath(ctx context.Context, m *mount, p string) (retEr
  return nil
 }
 
-func getFollowLinks(root *iradix.Node, k []byte, follow bool) ([]byte, *CacheRecord, error) {
- var linksWalked int
- return getFollowLinksWalk(root, k, follow, &linksWalked)
-}
-
-func getFollowLinksWalk(root *iradix.Node, k []byte, follow bool, linksWalked *int) ([]byte, *CacheRecord, error) {
+// getFollowParentLinks is effectively O_PATH|O_NOFOLLOW, where the final
+// component is looked up without doing any symlink resolution (if it is a
+// symlink).
+func getFollowParentLinks(root *iradix.Node, k []byte, follow bool) ([]byte, *CacheRecord, error) {
  v, ok := root.Get(k)
  if ok {
  return k, v.(*CacheRecord), nil
@@ -1085,41 +1063,95 @@ func getFollowLinksWalk(root *iradix.Node, k []byte, follow bool, linksWalked *i
  return k, nil, nil
  }
 
+ // Only fully evaluate the parent path.
  dir, file := splitKey(k)
-
- k, parent, err := getFollowLinksWalk(root, dir, follow, linksWalked)
+ dir, _, err := getFollowLinks(root, dir, follow)
  if err != nil {
  return nil, nil, err
  }
- if parent != nil {
- if parent.Type == CacheRecordTypeSymlink {
- *linksWalked++
- if *linksWalked > 255 {
- return nil, nil, errors.Errorf("too many links")
- }
 
- link := cleanLink(string(convertKeyToPath(dir)), parent.Linkname)
- return getFollowLinksWalk(root, append(convertPathToKey([]byte(link)), file...), follow, linksWalked)
- }
- }
- k = append(k, file...)
+ // Do a direct lookup of the final component.
+ k = append(dir, file...)
  v, ok = root.Get(k)
  if ok {
  return k, v.(*CacheRecord), nil
  }
  return k, nil, nil
 }
 
-func cleanLink(dir, linkname string) string {
- dirPath := path.Clean(dir)
- if dirPath == "." || dirPath == "/" {
- dirPath = ""
+func getFollowLinks(root *iradix.Node, k []byte, follow bool) ([]byte, *CacheRecord, error) {
+ v, ok := root.Get(k)
+ if ok && v.(*CacheRecord).Type != CacheRecordTypeSymlink {
+ return k, v.(*CacheRecord), nil
+ }
+ if !follow || len(k) == 0 {
+ return k, nil, nil
+ }
+
+ var (
+ currentPath = "/"
+ remainingPath = string(convertKeyToPath(k))
+ linksWalked int
+ cr *CacheRecord
+ )
+ // Trailing slashes are significant for the cache, but path.Clean strips
+ // them. We only care about the slash for the final lookup.
+ remainingPath, hadTrailingSlash := strings.CutSuffix(remainingPath, "/")
+ for remainingPath != "" {
+ // Get next component.
+ var part string
+ if i := strings.IndexRune(remainingPath, '/'); i == -1 {
+ part, remainingPath = remainingPath, ""
+ } else {
+ part, remainingPath = remainingPath[:i], remainingPath[i+1:]
+ }
+
+ // Apply the component to the path. Since it is a single component, and
+ // our current path contains no symlinks, we can just apply it
+ // leixically.
+ nextPath := path.Join("/", currentPath, part)
+ if nextPath == "/" {
+ // If we hit the root, just treat it as a directory.
+ currentPath = "/"
+ continue
+ }
+ if nextPath == currentPath {
+ // noop component
+ continue
+ }
+
+ cr = nil
+ v, ok := root.Get(convertPathToKey([]byte(nextPath)))
+ if ok {
+ cr = v.(*CacheRecord)
+ }
+ if !ok || cr.Type != CacheRecordTypeSymlink {
+ currentPath = nextPath
+ continue
+ }
+
+ linksWalked++
+ if linksWalked > maxSymlinkLimit {
+ return nil, nil, errTooManyLinks
+ }
+
+ remainingPath = cr.Linkname + "/" + remainingPath
+ if path.IsAbs(cr.Linkname) {
+ currentPath = "/"
+ }
  }
- link := path.Clean(linkname)
- if !path.IsAbs(link) {
- return path.Join("/", path.Join(path.Dir(dirPath), link))
+ // We've already looked up the final component. However, if there was a
+ // trailing slash in the original path, we need to do the lookup again with
+ // the slash applied.
+ if hadTrailingSlash {
+ cr = nil
+ currentPath += "/"
+ v, ok := root.Get(convertPathToKey([]byte(currentPath)))
+ if ok {
+ cr = v.(*CacheRecord)
+ }
  }
- return link
+ return convertPathToKey([]byte(currentPath)), cr, nil
 }
 
 func prepareDigest(fp, p string, fi os.FileInfo) (digest.Digest, error) {