Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix glob performance on large monorepos #1244

Merged
merged 4 commits into from
Jul 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/Spago/Command/Build.purs
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,9 @@ run opts = do
if Array.null pedanticPkgs || opts.depsOnly then
pure true
else do
logInfo $ "Looking for unused and undeclared transitive dependencies..."
logInfo "Looking for unused and undeclared transitive dependencies..."
eitherGraph <- Graph.runGraph globs opts.pursArgs
logDebug "Decoded the output of `purs graph` successfully. Analyzing dependencies..."
eitherGraph # either (prepareToDie >>> (_ $> false)) \graph -> do
env <- ask
checkResults <- map Array.fold $ for pedanticPkgs \(Tuple selected options) -> do
Expand Down
3 changes: 1 addition & 2 deletions src/Spago/Glob.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import mm from 'micromatch';
import * as fsWalk from '@nodelib/fs.walk';

export const testGlob = glob => mm.matcher(glob.include, {ignore: glob.ignore});
export const testGlob = glob => mm.matcher(glob.include, { ignore: glob.ignore });

export const fsWalkImpl = Left => Right => respond => options => path => () => {
const entryFilter = entry => options.entryFilter(entry)();
Expand All @@ -13,4 +13,3 @@ export const fsWalkImpl = Left => Right => respond => options => path => () => {
};

export const isFile = dirent => dirent.isFile();

98 changes: 60 additions & 38 deletions src/Spago/Glob.purs
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@ import Control.Monad.Maybe.Trans (runMaybeT)
import Control.Monad.Trans.Class (lift)
import Data.Array as Array
import Data.Filterable (filter)
import Data.Foldable (any, fold)
import Data.Foldable (any, traverse_)
import Data.String as String
import Data.Traversable (traverse_)
import Data.String as String.CodePoint
import Effect.Aff as Aff
import Effect.Ref as Ref
import Node.FS.Sync as SyncFS
import Node.Path as Path
import Record as Record
import Type.Proxy (Proxy(..))

type Glob =
{ ignore :: Array String
Expand All @@ -30,8 +28,10 @@ splitGlob { ignore, include } = (\a -> { ignore, include: [ a ] }) <$> include
type Entry = { name :: String, path :: String, dirent :: DirEnt }
type FsWalkOptions = { entryFilter :: Entry -> Effect Boolean, deepFilter :: Entry -> Effect Boolean }

-- https://nodejs.org/api/fs.html#class-fsdirent
foreign import data DirEnt :: Type
foreign import isFile :: DirEnt -> Boolean

foreign import fsWalkImpl
:: (forall a b. a -> Either a b)
-> (forall a b. b -> Either a b)
Expand All @@ -40,32 +40,32 @@ foreign import fsWalkImpl
-> String
-> Effect Unit

gitignoreGlob :: String -> String -> Glob
gitignoreGlob base =
gitignoreFileToGlob :: FilePath -> String -> Glob
gitignoreFileToGlob base =
String.split (String.Pattern "\n")
>>> map String.trim
>>> Array.filter (not <<< or [ String.null, isComment ])
>>> partitionMap
( \line -> do
let
resolve a = Path.concat [ base, a ]
pat a = withForwardSlashes $ resolve $ unpackPattern a
let pattern lin = withForwardSlashes $ Path.concat [ base, gitignorePatternToGlobPattern lin ]
case String.stripPrefix (String.Pattern "!") line of
Just negated -> Left $ pat negated
Nothing -> Right $ pat line
Just negated -> Left $ pattern negated
Nothing -> Right $ pattern line
)
>>> Record.rename (Proxy @"left") (Proxy @"ignore")
>>> Record.rename (Proxy @"right") (Proxy @"include")
>>> (\{ left, right } -> { ignore: left, include: right })

where
isComment = isJust <<< String.stripPrefix (String.Pattern "#")
leadingSlash = String.stripPrefix (String.Pattern "/")
trailingSlash = String.stripSuffix (String.Pattern "/")
dropSuffixSlash str = fromMaybe str $ String.stripSuffix (String.Pattern "/") str
dropPrefixSlash str = fromMaybe str $ String.stripPrefix (String.Pattern "/") str

leadingSlash str = String.codePointAt 0 str == Just (String.CodePoint.codePointFromChar '/')
trailingSlash str = String.codePointAt (String.length str - 1) str == Just (String.CodePoint.codePointFromChar '/')

unpackPattern :: String -> String
unpackPattern pattern
| Just a <- trailingSlash pattern = unpackPattern a
| Just a <- leadingSlash pattern = a <> "/**"
gitignorePatternToGlobPattern :: String -> String
gitignorePatternToGlobPattern pattern
| trailingSlash pattern = gitignorePatternToGlobPattern $ dropSuffixSlash pattern
| leadingSlash pattern = dropPrefixSlash pattern <> "/**"
| otherwise = "**/" <> pattern <> "/**"

fsWalk :: String -> Array String -> Array String -> Aff (Array Entry)
Expand All @@ -74,41 +74,63 @@ fsWalk cwd ignorePatterns includePatterns = Aff.makeAff \cb -> do

-- Pattern for directories which can be outright ignored.
-- This will be updated whenver a .gitignore is found.
ignoreMatcherRef :: Ref Glob <- Ref.new { ignore: [], include: ignorePatterns }
ignoreMatcherRef :: Ref (String -> Boolean) <- Ref.new (testGlob { ignore: [], include: ignorePatterns })
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why this never occurred to me before but if we always boolean OR || the matcher functions could we not accumulate into an Array (String -> Boolean) and flatten at the end with ||, avoiding the stack unsafety?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this what we are doing now with the double or (line 103 and line 115)?

Copy link
Contributor

@cakekindel cakekindel Jul 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not necessarily, because or (_ :: Array (_ -> Boolean)) still uses HeytingAlgebra (a -> Boolean), which doesn't get TCOd consistently / at all

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

trying this out on purescm/purescript-core and seeing a negligible difference. I'll keep it around in case someone opens a stack overflow issue again (or I can open the PR anyway)

https://github.com/purescript/spago/compare/master...cakekindel:fix-glob-perf?expand=1

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see - if the performance is comparable and we can make it stack safe then that's a welcome addition


-- If this Ref contains `true` because this Aff has been canceled, then deepFilter will always return false.
canceled <- Ref.new false

let
entryGitignore :: Entry -> Effect Unit
entryGitignore entry =
try (SyncFS.readTextFile UTF8 entry.path)
>>= traverse_ \gitignore ->
let
base = Path.relative cwd $ Path.dirname entry.path
glob = gitignoreGlob base gitignore
pats = splitGlob glob
patOk g = not $ any (testGlob g) includePatterns
newPats = filter patOk pats
in
void $ Ref.modify (_ <> fold newPats) $ ignoreMatcherRef
-- Update the ignoreMatcherRef with the patterns from a .gitignore file
updateIgnoreMatcherWithGitignore :: Entry -> Effect Unit
updateIgnoreMatcherWithGitignore entry = do
let
gitignorePath = entry.path
-- directory of this .gitignore relative to the directory being globbed
base = Path.relative cwd (Path.dirname gitignorePath)

try (SyncFS.readTextFile UTF8 entry.path) >>= traverse_ \gitignore -> do
let
gitignored = testGlob <$> (splitGlob $ gitignoreFileToGlob base gitignore)

-- Do not add `.gitignore` patterns that explicitly ignore the files
-- we're searching for;
--
-- ex. if `includePatterns` is [".spago/p/aff-1.0.0/**/*.purs"],
-- and `gitignored` is ["node_modules", ".spago"],
-- then add "node_modules" to `ignoreMatcher` but not ".spago"
wouldConflictWithSearch matcher = any matcher includePatterns

newMatchers = or $ filter (not <<< wouldConflictWithSearch) gitignored

-- Another possible approach could be to keep a growing array of patterns and
-- regenerate the matcher on every gitignore. We have tried that (see #1234),
-- and turned out to be 2x slower. (see #1242, and #1244)
-- Composing functions is faster, but there's the risk of blowing the stack
-- (see #1231) - when this was introduced in #1210, every match from the
-- gitignore file would be `or`ed to the previous matcher, which would create
-- a very long (linear) call chain - in this latest iteration we are `or`ing the
-- new matchers together, then the whole thing with the previous matcher.
-- This is still prone to stack issues, but we now have a tree so it should
-- not be as dramatic.
addMatcher currentMatcher = or [ currentMatcher, newMatchers ]

Ref.modify_ addMatcher ignoreMatcherRef

-- Should `fsWalk` recurse into this directory?
deepFilter :: Entry -> Effect Boolean
deepFilter entry = fromMaybe false <$> runMaybeT do
isCanceled <- lift $ Ref.read canceled
guard $ not isCanceled
shouldIgnore <- lift $ testGlob <$> Ref.read ignoreMatcherRef
shouldIgnore <- lift $ Ref.read ignoreMatcherRef
pure $ not $ shouldIgnore $ Path.relative cwd entry.path

-- Should `fsWalk` retain this entry for the result array?
entryFilter :: Entry -> Effect Boolean
entryFilter entry = do
when (isFile entry.dirent && entry.name == ".gitignore") (entryGitignore entry)
ignorePat <- Ref.read ignoreMatcherRef
let
ignoreMatcher = testGlob ignorePat
path = withForwardSlashes $ Path.relative cwd entry.path
when (isFile entry.dirent && entry.name == ".gitignore") do
updateIgnoreMatcherWithGitignore entry
ignoreMatcher <- Ref.read ignoreMatcherRef
let path = withForwardSlashes $ Path.relative cwd entry.path
pure $ includeMatcher path && not (ignoreMatcher path)

options = { entryFilter, deepFilter }
Expand Down
8 changes: 6 additions & 2 deletions test/Spago/Glob.purs
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,14 @@ spec = Spec.around globTmpDir do

Spec.it "is stacksafe" \p -> do
let
chars = [ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k" ]
-- 10,000-line gitignore
chars = [ "a", "b", "c", "d", "e", "f", "g", "h" ]
-- 4000-line gitignore
words = [ \a b c d -> a <> b <> c <> d ] <*> chars <*> chars <*> chars <*> chars
hugeGitignore = intercalate "\n" words
-- Write it in a few places
FS.writeTextFile (Path.concat [ p, ".gitignore" ]) hugeGitignore
FS.writeTextFile (Path.concat [ p, "fruits", ".gitignore" ]) hugeGitignore
FS.writeTextFile (Path.concat [ p, "fruits", "left", ".gitignore" ]) hugeGitignore
FS.writeTextFile (Path.concat [ p, "fruits", "right", ".gitignore" ]) hugeGitignore
a <- Glob.gitignoringGlob p [ "fruits/**/apple" ]
Array.sort a `Assert.shouldEqual` [ "fruits/left/apple", "fruits/right/apple" ]
Loading