-
Hi, what do you think is the best way to load large files?
Maybe I misunderstood this but I think that the library pushes us to make a parser that works on the whole file at once but I don't think it's the right approach for large files. The issue is that for a parser to work in a continuous consuming mode it has to be thought out this way. Should Thanks, FWIW I did build something to reduce this issue in my project.It's a protocol that represents the public protocol ProgressiveParsing {
associatedtype PartialType : Codable
associatedtype CompleteType : Codable
static func parser(inputFileURL: Foundation.URL, cacheKey: String?, loadFromCache: Bool, urlSession: URLSession) -> AsyncThrowingStream<Progression<CompleteType, PartialType>, Error>
}
public enum Progression<CompleteType: Codable, PartialType: Codable> {
case start
case parsingProgress(current: Int, total: Int, partial: PartialType)
case fullFile(CompleteType, fromCache: Bool)
} The public protocol ProgressiveParsing {
associatedtype PartialType : Codable
associatedtype CompleteType : Codable
static func parser(inputFileURL: Foundation.URL, cacheKey: String?, loadFromCache: Bool, urlSession: URLSession) -> AsyncThrowingStream<Progression<CompleteType, PartialType>, Error>
}
public enum Progression<CompleteType: Codable, PartialType: Codable> {
case start
case parsingProgress(current: Int, total: Int, partial: PartialType)
case fullFile(CompleteType, fromCache: Bool)
} FileConsumer can:
import Foundation
/// Interface summary of an actor exposing cache and download/parse entry points.
///
/// Only signatures are shown in this excerpt; the notes below describe what the
/// declarations themselves state. Anything about runtime behavior is marked as an
/// assumption to confirm against the real implementation.
public actor FileConsumer {
/// Events delivered by the streams returned from `downloadThenParse` and `downloadAndParse`.
///
/// `Partial` is the element type carried by `progress`; `Complete` is the payload of `finished`.
public enum ProgressState<Partial: Codable, Complete: Codable> {
/// Carries no payload. NOTE(review): presumably emitted first — confirm.
case start
/// Intermediate report: two counters (`current`, `total`), a line number, a line length
/// (label spelled `lineLenght` in the API), and the `Partial` values produced so far or for
/// this step — NOTE(review): units and exact meaning are not visible here; confirm.
case progress(current: Int, total: Int, lineNbr: Int, lineLenght: Int, generated: [Partial])
/// Final value together with a flag stating whether it came from the cache.
case finished(Complete, fromCache: Bool)
}
/// Asynchronous, throwing entry point taking a URL, an optional cache key (default `nil`)
/// and a `Codable` value. NOTE(review): presumably persists `toCache` for later
/// `loadFromCache` lookups — confirm.
public func saveInCache<Complete: Codable>(url: URL, cacheKey: String? = nil, _ toCache: Complete) async throws
/// Returns a stream of `ProgressState` where both the partial and the complete type are `Complete`.
///
/// The `parse` closure receives the input as an `inout Substring.UTF8View` and returns one
/// `Complete` value. NOTE(review): the name suggests the download finishes before parsing
/// starts — confirm.
public func downloadThenParse<Complete: Codable>(
url: URL,
cacheOutput: Complete.Type,
cacheKey: String? = nil,
loadFromCache: Bool = true,
urlSession: URLSession = .shared,
parse: @escaping (inout Substring.UTF8View) throws -> Complete
) -> AsyncThrowingStream<ProgressState<Complete, Complete>, Error>
/// Returns a stream of `ProgressState<Partial, Complete>`.
///
/// The `parse` closure receives the input as an `inout Substring.UTF8View` and returns an
/// array of `Partial` values. `useFullLines` defaults to `true` and `lineIteration` to `1`.
/// NOTE(review): presumably parsing is interleaved with the download, line by line — confirm.
public func downloadAndParse<Partial: Codable, Complete: Codable>(
url: URL,
cacheOutput: Complete.Type,
cacheKey: String? = nil,
loadFromCache: Bool = true,
useFullLines: Bool = true,
lineIteration: Int = 1,
urlSession: URLSession = .shared,
parse: @escaping (inout Substring.UTF8View) throws -> [Partial]
) -> AsyncThrowingStream<ProgressState<Partial, Complete>, Error>
} |
Beta Was this translation helpful? Give feedback.
Replies: 2 comments 4 replies
-
@mackoj We'd love to enhance parsing to work well with async sequences, but haven't figured out an approach that quite generically fits into the existing library. If you explore any promising approaches we'd love to hear about them! |
Beta Was this translation helpful? Give feedback.
-
Hi @stephencelis, I arrived at an API that I find simple and that integrates pretty well with the current API of swift-parsing. /* AsyncThrowingStream<ParserStreamProgression<Output>, Error> */
for try await parsed in someParser.stream(inputFileURL: inputFileURL) {
switch parsed {
case .start:
print("The parsing processing process has started")
case let .progress(current, total, parsed):
print("Have found \(parsed.count) elements.")
case let .finish(parsed):
print("Have found \(parsed.count) elements.")
}
} The
If you chose to use the Below is an example implementation of it; it is not perfect, but it gives an idea of the feasibility. Rest of the code
import Foundation
import Parsing
extension Parser {
  /// Parses the content found at `url` and reports the outcome as an asynchronous stream.
  ///
  /// The stream always begins with `.start`. What follows depends on `download`:
  /// - `.downloadThenParse`: the whole content is loaded (a non-file URL is first downloaded
  ///   to a temporary file), parsed once, and delivered as a single `.finish` value.
  /// - `.downloadAndParse`: bytes are read through `urlSession` and cut into chunks by
  ///   `computedLines`. After every chunk the parser runs on the text that has not been
  ///   consumed yet and its output is delivered as `.progress`. No `.finish` value is
  ///   produced in this mode.
  ///
  /// A download, decoding or parsing error ends the stream by throwing.
  ///
  /// - Parameters:
  ///   - url: Location of the input; may be a file URL or a remote URL.
  ///   - transform: Converts between `String` and the parser's `Input`.
  ///   - download: Strategy used to obtain and feed the input.
  ///   - urlSession: Session used for network access.
  /// - Returns: A stream of `ParserStreamProgression` events.
  public func stream(
    inputFileURL url: Foundation.URL,
    transform: Transform<Input>,
    download: SteamDownloadProcess = .downloadAndParse(useFullLines: true, lineIteration: 1),
    urlSession: URLSession = .shared
  ) -> AsyncThrowingStream<ParserStreamProgression<Output>, Error> where Output: Codable {
    return AsyncThrowingStream { continuation in
      continuation.yield(.start)
      switch download {
      case .downloadThenParse:
        // Only remote content needs a temporary copy; local files are read in place.
        let temporaryURL: URL?
        if url.isFileURL {
          temporaryURL = nil
        } else {
          temporaryURL = try await urlSession.download(for: URLRequest(url: url)).0
        }
        // The downloaded copy belongs to us: remove it on every exit path so repeated
        // calls do not pile up files in the temporary directory.
        defer {
          if let temporaryURL = temporaryURL {
            try? FileManager.default.removeItem(at: temporaryURL)
          }
        }
        // Decode as UTF-8 explicitly, matching the streaming branch (which decodes the
        // byte stream as UTF-8 characters). The whole file is still held in memory.
        let fileContent = try String(contentsOf: temporaryURL ?? url, encoding: .utf8)
        var input = transform.apply(fileContent)
        let parsed = try self.parse(&input)
        continuation.yield(.finish(parsed))
      case let .downloadAndParse(useFullLines, lineIteration):
        let (bytes, response) = try await urlSession.bytes(for: URLRequest(url: url))
        // `expectedContentLength` is -1 when the length is unknown, so `total` may be -1.
        let total = Int(response.expectedContentLength)
        var bytesRead = 0
        var pending = ""
        for try await chunk in bytes.computedLines(useFullLines, lineIteration) {
          try Task.checkCancellation()
          bytesRead += chunk.utf8.count
          pending.append(chunk)
          var input = transform.apply(pending)
          let parsedChunk: Output = try self.parse(&input)
          continuation.yield(.progress(current: bytesRead, total: total, parsed: parsedChunk))
          // Whatever the parser left unconsumed is carried over to the next chunk.
          pending = transform.unapply(input)
        }
      }
      continuation.finish()
    }
  }
}
extension Parser where Input == Substring.UTF8View {
  /// Convenience overload for parsers whose input is a UTF-8 view.
  ///
  /// Forwards to `stream(inputFileURL:transform:download:urlSession:)` using the `.utf8`
  /// transform, so callers do not have to pass a transform themselves.
  ///
  /// - Parameters:
  ///   - url: Location of the input.
  ///   - download: Strategy used to obtain and feed the input.
  ///   - urlSession: Session used for network access.
  /// - Returns: The stream produced by the forwarded call.
  public func stream(
    inputFileURL url: Foundation.URL,
    download: SteamDownloadProcess = .downloadAndParse(useFullLines: true, lineIteration: 1),
    urlSession: URLSession = .shared
  ) -> AsyncThrowingStream<ParserStreamProgression<Output>, Error> where Output: Codable {
    let utf8Transform: Transform<Substring.UTF8View> = .utf8
    return self.stream(
      inputFileURL: url,
      transform: utf8Transform,
      download: download,
      urlSession: urlSession
    )
  }
}
/// Strategy used by `Parser.stream` to obtain its input and hand it to the parser.
public enum SteamDownloadProcess {
  /// Load the entire content first, then run the parser once on all of it.
  case downloadThenParse

  /// Run the parser while the content is still being received.
  ///
  /// - `useFullLines`: `true` means the newline is not consumed, i.e. it stays in the text
  ///   sent to the parser.
  /// - `lineIteration`: number of lines sent to the parser at a time.
  case downloadAndParse(
    useFullLines: Bool,
    lineIteration: Int
  )
}
/// Events delivered by the stream returned from `Parser.stream`.
public enum ParserStreamProgression<Output: Codable> {
  /// Marks the beginning of the stream; carries no payload.
  case start

  /// An intermediate parser output, with `current` and `total` progress counters.
  case progress(
    current: Int,
    total: Int,
    parsed: Output
  )

  /// The parser output for the complete input.
  case finish(Output)
}
import Foundation
extension URLSession.AsyncBytes {
  /// Cuts the received characters into chunks of `nbr` lines, keeping empty lines.
  ///
  /// With `useFullline == true` and `nbr == 1`, the text `"Hello World\n\nI'm Alive"` is
  /// delivered as `"Hello World\n"`, `"\n"`, `"I'm Alive"`. With `useFullline == false` the
  /// terminator that closes each chunk is dropped, giving `"Hello World"`, `""`, `"I'm Alive"`.
  /// When `nbr > 1`, the line terminators inside a chunk are always kept; only the one that
  /// closes the chunk is affected by `useFullline`.
  ///
  /// Both `"\n"` and `"\r\n"` end a line. Any text remaining after the last terminator is
  /// delivered as a final chunk.
  ///
  /// - Parameters:
  ///   - useFullline: Whether the terminator that closes a chunk is included in it.
  ///   - nbr: Number of lines per chunk; values below 1 are treated as 1.
  /// - Returns: A stream of text chunks that throws if reading the bytes fails.
  public func computedLines(_ useFullline: Bool = false, _ nbr: Int = 1) -> AsyncThrowingStream<String, Error> {
    // A count below 1 could never be reached and would buffer the whole body into a
    // single chunk. `Swift.max` is spelled out because `AsyncSequence` has its own `max()`.
    let linesPerChunk = Swift.max(nbr, 1)
    return AsyncThrowingStream { continuation in
      var chunk = ""
      var lineCount = 0
      for try await character in self.characters {
        // "\r\n" forms a single `Character` that is not equal to "\n", so it has to be
        // matched explicitly or CRLF input would never be split.
        let endsLine = character == "\n" || character == "\r\n"
        if endsLine {
          lineCount += 1
        }
        if endsLine, lineCount == linesPerChunk {
          if useFullline {
            // Keep the terminator exactly as received so byte counts stay accurate.
            chunk.append(character)
          }
          continuation.yield(chunk)
          chunk = ""
          lineCount = 0
        } else {
          chunk.append(character)
        }
      }
      if !chunk.isEmpty {
        continuation.yield(chunk)
      }
      continuation.finish()
    }
  }
}
import Foundation
/// A reversible conversion between `String` and a parser input type.
///
/// `apply` turns text into `Input` before parsing; `unapply` turns whatever input is left
/// after parsing back into text.
public struct Transform<Input> {
  /// Converts text into the parser input.
  public let apply: (_ input: String) -> Input
  /// Converts parser input back into text.
  public let unapply: (_ input: Input) -> String

  /// Creates a transform from a pair of conversion closures.
  ///
  /// The initializer is public so that code outside this module can supply transforms
  /// for input types other than `Substring.UTF8View`.
  ///
  /// - Parameters:
  ///   - apply: Converts text into the parser input.
  ///   - unapply: Converts parser input back into text.
  public init(apply: @escaping (_ input: String) -> Input, unapply: @escaping (_ input: Input) -> String) {
    self.apply = apply
    self.unapply = unapply
  }
}

extension Transform where Input == Substring.UTF8View {
  /// Transform for parsers that consume a UTF-8 view.
  ///
  /// This is a computed property: generic types cannot hold stored static properties,
  /// and a settable global would be shared mutable state.
  public static var utf8: Transform {
    Transform(
      apply: { $0[...].utf8 },
      // A parser may stop in the middle of a multi-byte scalar. `String(decoding:as:)`
      // repairs such input with U+FFFD instead of trapping on a failed conversion.
      unapply: { String(decoding: $0, as: UTF8.self) }
    )
  }
}
extension AsyncThrowingStream where Failure == Error {
  /// Creates a stream whose elements are produced by an asynchronous, throwing closure.
  ///
  /// `build` runs in its own task. When it returns, the stream finishes; when it throws,
  /// the stream finishes by throwing that error. When the stream terminates for any other
  /// reason (for example because the consuming task is cancelled), the producing task is
  /// cancelled so it does not keep working for a consumer that is gone.
  ///
  /// - Parameter build: Produces elements through the given continuation.
  public init(_ build: @escaping (Continuation) async throws -> Void) {
    self.init { continuation in
      let producer = Task {
        do {
          try await build(continuation)
          continuation.finish()
        } catch {
          continuation.finish(throwing: error)
        }
      }
      // Propagate termination to the producer; cancelling a finished task is a no-op.
      continuation.onTermination = { _ in
        producer.cancel()
      }
    }
  }
}
Beta Was this translation helpful? Give feedback.
Hi @stephencelis,
I arrived at an API that I find simple and that integrates pretty well with the current API of swift-parsing.
Instead of calling `.parse` on a parser we can call `.stream`; it returns an `AsyncThrowingStream<ParserStreamProgression<Output>, Error>`. The `stream` function is c…