Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
c30a72a
feat(parser): 新增document的csv文件parser接口实现
CXeon Nov 12, 2025
3326fe1
Merge branch 'main' into feat/document-csv-parser
CXeon Nov 14, 2025
6a52e2b
Merge branch 'main' into feat/document-csv-parser
CXeon Nov 17, 2025
d0fb93d
Merge branch 'main' into feat/document-csv-parser
CXeon Nov 19, 2025
4f45815
Merge branch 'main' into feat/document-csv-parser
CXeon Nov 21, 2025
b7fd466
Merge branch 'main' into feat/document-csv-parser
hi-pender Nov 21, 2025
504a748
Merge branch 'main' into feat/document-csv-parser
hi-pender Nov 24, 2025
98eafd8
fix(csv_parser):resolved an incorrect package import for context and …
CXeon Dec 3, 2025
2c978a1
Merge branch 'cloudwego:main' into feat/document-csv-parser
CXeon Dec 3, 2025
163e5b5
Merge branch 'feat/document-csv-parser' of github.com:CXeon/eino-ext …
CXeon Dec 3, 2025
fb1f058
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 4, 2025
ff9fc9d
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 5, 2025
ebc9940
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 5, 2025
1f0a7da
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 5, 2025
31f9be3
Merge branch 'main' into feat/document-csv-parser
hi-pender Dec 5, 2025
6b637ff
Merge branch 'main' into feat/document-csv-parser
hi-pender Dec 5, 2025
14584e2
feat(parser): add copyright and license information
hi-pender Dec 5, 2025
b032f30
feat(parser): add copyright and license to csv_parser_test.go
hi-pender Dec 5, 2025
7600202
Merge branch 'main' of github.com:CXeon/eino-ext into feat/document-c…
CXeon Dec 8, 2025
20757f1
feat(csv_parser): add README.md file and examples directory.
CXeon Dec 8, 2025
5e5da67
Merge branch 'feat/document-csv-parser' of github.com:CXeon/eino-ext …
CXeon Dec 8, 2025
eb0f369
style(readme): Removed the description of the certificate.
CXeon Dec 8, 2025
00d0c71
style(readme): Add LICENSE.
CXeon Dec 8, 2025
34d5ae4
style(readme): Add license header in main.
CXeon Dec 8, 2025
13c697d
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 8, 2025
fac519c
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 9, 2025
ef1824b
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 11, 2025
13f863c
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 12, 2025
31c3008
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 14, 2025
bf7253c
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 16, 2025
5bc4b76
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 18, 2025
de4cd51
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 19, 2025
ffde69f
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 22, 2025
9c285ba
Merge branch 'main' into feat/document-csv-parser
CXeon Dec 30, 2025
4b716c7
Merge branch 'main' into feat/document-csv-parser
CXeon Jan 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions components/document/parser/csv/csv_parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
package csv

import (
"encoding/csv"
"fmt"
"io"
"strings"

"github.com/cloudwego/eino/components/document/parser"
"github.com/cloudwego/eino/schema"
"golang.org/x/net/context"
)

// Keys written into schema.Document.MetaData by CsvParser.Parse.
const (
// MetaDataRow is the key holding the per-row map built by buildRowMetaData:
// header name -> cell value. The map is empty when Config.NoHeader is set.
MetaDataRow = "_row"
// MetaDataExt is the key holding the ExtraMeta taken from the parser
// options. It is only present when that ExtraMeta is non-nil.
MetaDataExt = "_ext"
)

// CsvParser parses CSV content from an io.Reader into schema.Document values,
// producing one document per data row (see Parse).
type CsvParser struct {
// Config holds the parsing options. The methods of CsvParser dereference it
// without a nil check, so it must be non-nil; NewCsvParser always sets it.
Config *Config
}

// Config configures a CsvParser.
type Config struct {
// NoHeader reports whether the input has no header row. It is false by
// default, which means the first row is used as the table header: that row
// is not turned into a document, and its cells become the keys of the
// per-row metadata map.
NoHeader bool
// IDPrefix is prepended to the row index to form the document ID. When it
// is empty the ID is the bare row index.
IDPrefix string
// Comma is the field delimiter. NewCsvParser replaces a zero value with ','.
Comma rune
// Comment is the comment character. NewCsvParser replaces a zero value
// with '#'.
Comment rune
}

// NewCsvParser creates a CsvParser from config.
//
// A nil config is treated as an empty Config. A zero Comma is replaced by ','
// and a zero Comment by '#'. The defaults are applied to a private copy of
// config, so the value passed by the caller is never modified and may safely
// be reused to build other parsers.
//
// ctx is currently unused, and the returned error is always nil.
func NewCsvParser(ctx context.Context, config *Config) (cp *CsvParser, err error) {
	// Work on a copy: filling in defaults must not leak into the caller's struct.
	var cfg Config
	if config != nil {
		cfg = *config
	}
	if cfg.Comma == 0 {
		cfg.Comma = ','
	}
	if cfg.Comment == 0 {
		cfg.Comment = '#'
	}

	return &CsvParser{Config: &cfg}, nil
}

// generateID returns the document ID for row index i: the configured
// IDPrefix immediately followed by i in decimal.
func (cp *CsvParser) generateID(i int) string {
	// An empty prefix formats to nothing, so one format string yields the
	// bare index in that case as well.
	return fmt.Sprintf("%s%d", cp.Config.IDPrefix, i)
}

// buildRowMetaData maps each header name to the cell in the same column of
// row. Columns that exist in only one of the two slices are ignored. The
// returned map is always non-nil; it is empty when Config.NoHeader is set.
func (cp *CsvParser) buildRowMetaData(row []string, headers []string) map[string]any {
	fields := make(map[string]any)
	if cp.Config.NoHeader {
		return fields
	}

	// Only columns present in both the header and the row can be paired.
	limit := len(headers)
	if len(row) < limit {
		limit = len(row)
	}
	for col := 0; col < limit; col++ {
		fields[headers[col]] = row[col]
	}
	return fields
}

// Parse reads every CSV record from reader and returns one schema.Document per
// data row.
//
// The underlying csv.Reader is configured with the delimiter and comment
// character from cp.Config, so the input is split on the same rune that is
// later used to join the whitespace-trimmed cells into Document.Content.
// Unless Config.NoHeader is set, the first record is treated as the header: it
// is not emitted as a document, and its cells become the keys of the per-row
// map stored under MetaDataRow. When the parser options carry a non-nil
// ExtraMeta, it is stored under MetaDataExt.
//
// Document IDs are built from the record's zero-based position among the
// records read (header included), so with a header the first document gets
// index 1.
//
// Parse returns (nil, nil) when the input holds no records, and returns the
// encoding/csv error unchanged when the input cannot be read.
func (cp *CsvParser) Parse(ctx context.Context, reader io.Reader, opts ...parser.Option) ([]*schema.Document, error) {
	option := parser.GetCommonOptions(&parser.Options{}, opts...)

	csvFile := csv.NewReader(reader)
	// encoding/csv rejects a zero delimiter, so keep the reader's own default
	// when the parser was built without NewCsvParser and Comma is unset.
	if cp.Config.Comma != 0 {
		csvFile.Comma = cp.Config.Comma
	}
	// encoding/csv also rejects Comment == Comma. The '#' comment character is
	// only an implicit default, so it must not make a '#'-delimited file
	// unreadable; in that case comment handling stays disabled.
	if cp.Config.Comment != 0 && cp.Config.Comment != csvFile.Comma {
		csvFile.Comment = cp.Config.Comment
	}

	// Read all records up front.
	rows, err := csvFile.ReadAll()
	if err != nil {
		return nil, err
	}
	if len(rows) == 0 {
		return nil, nil
	}

	// Split off the header, if any.
	startIdx := 0
	var headers []string
	if !cp.Config.NoHeader {
		headers = rows[0]
		startIdx = 1
	}

	// Stay nil when there are no data rows, as callers may test for nil.
	var ret []*schema.Document
	if n := len(rows) - startIdx; n > 0 {
		ret = make([]*schema.Document, 0, n)
	}

	// Join cells with the delimiter the reader actually split on.
	sep := string(csvFile.Comma)

	for i := startIdx; i < len(rows); i++ {
		row := rows[i]
		if len(row) == 0 {
			continue
		}

		// Content is the row re-joined from whitespace-trimmed cells.
		contentParts := make([]string, len(row))
		for j, cell := range row {
			contentParts[j] = strings.TrimSpace(cell)
		}

		// The row map is built from the untrimmed cells.
		meta := map[string]any{
			MetaDataRow: cp.buildRowMetaData(row, headers),
		}
		if option.ExtraMeta != nil {
			meta[MetaDataExt] = option.ExtraMeta
		}

		ret = append(ret, &schema.Document{
			ID:       cp.generateID(i),
			Content:  strings.Join(contentParts, sep),
			MetaData: meta,
		})
	}

	return ret, nil
}
38 changes: 38 additions & 0 deletions components/document/parser/csv/csv_parser_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package csv

import (
"os"
"testing"

"github.com/cloudwego/eino/components/document/parser"
"golang.org/x/net/context"
)

// TestCsvParser parses the ./test.csv fixture with a default-configured parser
// and checks the invariants Parse establishes for every returned document: a
// non-empty ID, a row-metadata entry, and the extra metadata passed through
// the parser options.
func TestCsvParser(t *testing.T) {
	f, err := os.Open("./test.csv")
	if err != nil {
		t.Fatalf("open fixture: %v", err)
	}
	defer f.Close()

	ctx := context.Background()
	cp, err := NewCsvParser(ctx, &Config{})
	if err != nil {
		t.Fatalf("NewCsvParser: %v", err)
	}

	docs, err := cp.Parse(ctx, f, parser.WithURI("local"), parser.WithExtraMeta(map[string]any{
		"_extension": ".csv",
		"_file_name": "test.csv",
		"_source":    "local",
	}))
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}

	for i, doc := range docs {
		if doc == nil {
			t.Fatalf("docs[%d] is nil", i)
		}
		if doc.ID == "" {
			t.Errorf("docs[%d]: empty ID", i)
		}
		if _, ok := doc.MetaData[MetaDataRow]; !ok {
			t.Errorf("docs[%d]: missing %q metadata", i, MetaDataRow)
		}
		if _, ok := doc.MetaData[MetaDataExt]; !ok {
			t.Errorf("docs[%d]: missing %q metadata", i, MetaDataExt)
		}
		// Log the fields rather than the pointer so the output is readable.
		t.Logf("doc %s: %q", doc.ID, doc.Content)
	}
}
47 changes: 47 additions & 0 deletions components/document/parser/csv/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
module github.com/cloudwego/eino-ext/components/document/parser/csv

go 1.24.0

toolchain go1.24.4

require (
github.com/cloudwego/eino v0.5.13
golang.org/x/net v0.47.0
)

require (
github.com/bahlo/generic-list-go v0.2.0 // indirect
github.com/buger/jsonparser v1.1.1 // indirect
github.com/bytedance/gopkg v0.1.3 // indirect
github.com/bytedance/sonic v1.14.1 // indirect
github.com/bytedance/sonic/loader v0.3.0 // indirect
github.com/cloudwego/base64x v0.1.6 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/eino-contrib/jsonschema v1.0.2 // indirect
github.com/getkin/kin-openapi v0.118.0 // indirect
github.com/go-openapi/jsonpointer v0.19.5 // indirect
github.com/go-openapi/swag v0.19.5 // indirect
github.com/goph/emperror v0.17.2 // indirect
github.com/invopop/yaml v0.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/cpuid/v2 v2.2.9 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect
github.com/nikolalohinski/gonja v1.5.3 // indirect
github.com/pelletier/go-toml/v2 v2.0.9 // indirect
github.com/perimeterx/marshmallow v1.1.4 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/slongfield/pyfmt v0.0.0-20220222012616-ea85ff4c361f // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect
github.com/yargevad/filepathx v1.0.0 // indirect
golang.org/x/arch v0.11.0 // indirect
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 // indirect
golang.org/x/sys v0.38.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
Loading
Loading