Skip to content

Commit d3eb140

Browse files
committed
magika: add ml-based content type detection
Signed-off-by: Hank Donnay <[email protected]>
1 parent 49f1591 commit d3eb140

File tree

9 files changed

+1501
-0
lines changed

9 files changed

+1501
-0
lines changed

detector/magika/_cmd/ortgen/go.mod

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
module ortgen
2+
3+
go 1.24.3
4+
5+
require modernc.org/cc/v4 v4.26.1
6+
7+
require (
8+
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
9+
modernc.org/mathutil v1.7.1 // indirect
10+
modernc.org/opt v0.1.4 // indirect
11+
modernc.org/sortutil v1.2.1 // indirect
12+
modernc.org/strutil v1.2.1 // indirect
13+
modernc.org/token v1.1.0 // indirect
14+
)

detector/magika/_cmd/ortgen/go.sum

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
2+
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
3+
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
4+
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
5+
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
6+
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
7+
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
8+
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
9+
modernc.org/cc/v4 v4.26.1 h1:+X5NtzVBn0KgsBCBe+xkDC7twLb/jNVj9FPgiwSQO3s=
10+
modernc.org/cc/v4 v4.26.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0=
11+
modernc.org/ccorpus2 v1.5.2 h1:Ui+4tc58mf/W+2arcYCJR903y3zl3ecsI7Fpaaqozyw=
12+
modernc.org/ccorpus2 v1.5.2/go.mod h1:Wifvo4Q/qS/h1aRoC2TffcHsnxwTikmi1AuLANuucJQ=
13+
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
14+
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
15+
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
16+
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
17+
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
18+
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
19+
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
20+
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
21+
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
22+
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
package main
2+
3+
import (
4+
"archive/tar"
5+
"bufio"
6+
"bytes"
7+
"compress/gzip"
8+
"context"
9+
"flag"
10+
"fmt"
11+
"io"
12+
"io/fs"
13+
"net/http"
14+
"os"
15+
"path/filepath"
16+
"strings"
17+
18+
"modernc.org/cc/v4"
19+
)
20+
21+
const tarURL = "https://github.com/microsoft/onnxruntime/releases/download/v1.15.1/onnxruntime-linux-x64-1.15.1.tgz"
22+
23+
func main() {
24+
var code int
25+
defer func() {
26+
if code != 0 {
27+
os.Exit(code)
28+
}
29+
}()
30+
var out *os.File
31+
in := flag.String("url", tarURL, "fetch ONNX Runtime release tarball from `URL`")
32+
pkg := flag.String("pacakge", "magika", "generated package `name`")
33+
flag.Func("o", "output to `file`", func(p string) error {
34+
if out != nil {
35+
if err := out.Close(); err != nil {
36+
return err
37+
}
38+
}
39+
f, err := os.Create(p)
40+
if err != nil {
41+
return err
42+
}
43+
out = f
44+
return nil
45+
})
46+
flag.Parse()
47+
48+
if out == nil {
49+
out = os.Stdout
50+
}
51+
defer out.Close()
52+
53+
ctx := context.Background()
54+
if err := Main(ctx, out, *in, *pkg); err != nil {
55+
fmt.Fprintln(os.Stderr, err)
56+
code = 1
57+
}
58+
}
59+
60+
const genHeader = `// Code generated by ortgen. DO NOT EDIT.`
61+
62+
func Main(ctx context.Context, out io.Writer, in, pkg string) error {
63+
const header = `onnxruntime_c_api.h`
64+
dir, err := fetchTarball(ctx, in)
65+
if err != nil {
66+
return err
67+
}
68+
defer os.RemoveAll(dir.Name())
69+
defer dir.Close()
70+
71+
cfg, err := cc.NewConfig("linux", "amd64")
72+
if err != nil {
73+
return err
74+
}
75+
ms, _ := fs.Glob(dir.FS(), `*/include/`+header)
76+
f, err := dir.Open(ms[0])
77+
if err != nil {
78+
return err
79+
}
80+
src := []cc.Source{
81+
{Name: "<predefined>", Value: cfg.Predefined},
82+
{Name: "<builtin>", Value: cc.Builtin},
83+
{Name: header, Value: f},
84+
}
85+
ast, err := cc.Translate(cfg, src)
86+
if err != nil {
87+
return err
88+
}
89+
90+
fmt.Fprintf(out, "%s\n\npackage %s\n\nimport \"structs\"\n", genHeader, pkg)
91+
// Walk the list, looking for the bits we want:
92+
var buf bytes.Buffer
93+
for cur := ast.TranslationUnit; cur != nil; cur = cur.TranslationUnit {
94+
buf.Reset()
95+
if cur.ExternalDeclaration.Case != cc.ExternalDeclarationDecl {
96+
continue
97+
}
98+
decl := cur.ExternalDeclaration.Declaration
99+
if decl.Position().Filename != header {
100+
continue
101+
}
102+
spec := decl.DeclarationSpecifiers
103+
if spec == nil || spec.Case != cc.DeclarationSpecifiersTypeSpec {
104+
continue
105+
}
106+
ty := spec.TypeSpecifier
107+
if ty == nil || ty.Case != cc.TypeSpecifierStructOrUnion {
108+
continue
109+
}
110+
structSpec := ty.StructOrUnionSpecifier
111+
if structSpec == nil ||
112+
structSpec.Case != cc.StructOrUnionSpecifierDef ||
113+
structSpec.StructOrUnion.Case != cc.StructOrUnionStruct {
114+
continue
115+
}
116+
117+
n := structSpec.Token.SrcStr()
118+
switch n {
119+
case "OrtApi":
120+
case "OrtApiBase":
121+
default:
122+
continue
123+
}
124+
fmt.Fprintf(&buf, "\ntype %s struct {\n\t_ structs.HostLayout\n\n", strings.Replace(n, "O", "o", 1))
125+
126+
for cur := structSpec.StructDeclarationList; cur != nil; cur = cur.StructDeclarationList {
127+
buf.WriteString("\t// ")
128+
buf.WriteString(cc.NodeSource(cur.StructDeclaration))
129+
buf.WriteByte('\n')
130+
131+
// pull out the function pointer ident:
132+
decl := cur.StructDeclaration.StructDeclaratorList.StructDeclarator.Declarator.DirectDeclarator.DirectDeclarator.Declarator.DirectDeclarator
133+
buf.WriteByte('\t')
134+
buf.Write(decl.Token.Src())
135+
buf.WriteString(" uintptr\n")
136+
}
137+
138+
buf.WriteString("}\n")
139+
if _, err := io.Copy(out, &buf); err != nil {
140+
return err
141+
}
142+
}
143+
144+
return nil
145+
}
146+
147+
// fetchTarball downloads the gzip-compressed tar archive at in and extracts
// it into a fresh temporary directory, which is returned as an *os.Root.
//
// The request is bound to ctx. On success the caller owns the result and is
// responsible for closing the root and removing the directory (root.Name()).
// On any failure, including a non-200 response or a malformed archive, the
// temporary directory is removed and a nil root is returned.
//
// Directory entries are created with mode 0o755; every other entry is written
// as a plain file via root.Create.
func fetchTarball(ctx context.Context, in string) (*os.Root, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, in, nil)
	if err != nil {
		return nil, err
	}
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	// Deferred before the status check so the body is closed on every path.
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status: %v", res.Status)
	}
	z, err := gzip.NewReader(res.Body)
	if err != nil {
		return nil, err
	}
	defer z.Close()

	d, err := os.MkdirTemp("", "ortgen.")
	if err != nil {
		return nil, err
	}
	var root *os.Root
	ok := false
	// Registered before OpenRoot so the directory is also removed when opening
	// it fails.
	defer func() {
		if ok {
			return
		}
		if root != nil {
			root.Close()
		}
		os.RemoveAll(d)
	}()
	root, err = os.OpenRoot(d)
	if err != nil {
		return nil, err
	}

	rd := tar.NewReader(bufio.NewReader(z))
	for {
		h, err := rd.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			// Anything but a clean end of archive means the tree is incomplete.
			return nil, fmt.Errorf("reading tarball: %w", err)
		}
		if err := extractEntry(root, rd, h); err != nil {
			return nil, err
		}
	}

	ok = true
	return root, nil
}

// extractEntry writes the archive entry described by h, whose contents are
// read from rd, into root.
func extractEntry(root *os.Root, rd io.Reader, h *tar.Header) error {
	p := filepath.Join(".", h.Name)
	if h.FileInfo().IsDir() {
		return root.Mkdir(p, 0o755)
	}
	f, err := root.Create(p)
	if err != nil {
		return err
	}
	if _, err := io.Copy(f, rd); err != nil {
		f.Close()
		return err
	}
	// A Close error can signal data that never made it to disk.
	return f.Close()
}

0 commit comments

Comments
 (0)