Skip to content

Commit 9254afe

Browse files
authored
Merge pull request #42 from jsonjoy-com/copilot/fix-41
feat: implement Apache Avro encoder with schema validation
2 parents 1a42e8b + b2ab5bf commit 9254afe

File tree

7 files changed

+2124
-0
lines changed

7 files changed

+2124
-0
lines changed

src/avro/AvroEncoder.ts

Lines changed: 290 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,290 @@
1+
import type {IWriter, IWriterGrowable} from '@jsonjoy.com/util/lib/buffers';
2+
import type {BinaryJsonEncoder} from '../types';
3+
4+
/**
5+
* Apache Avro binary encoder for basic value encoding.
6+
* Implements the Avro binary encoding specification without schema validation.
7+
* Based on https://avro.apache.org/docs/1.12.0/specification/
8+
*/
9+
export class AvroEncoder implements BinaryJsonEncoder {
10+
constructor(public readonly writer: IWriter & IWriterGrowable) {}
11+
12+
public encode(value: unknown): Uint8Array {
13+
const writer = this.writer;
14+
writer.reset();
15+
this.writeAny(value);
16+
return writer.flush();
17+
}
18+
19+
/**
20+
* Called when the encoder encounters a value that it does not know how to encode.
21+
*/
22+
public writeUnknown(value: unknown): void {
23+
this.writeNull();
24+
}
25+
26+
public writeAny(value: unknown): void {
27+
switch (typeof value) {
28+
case 'boolean':
29+
return this.writeBoolean(value);
30+
case 'number':
31+
return this.writeNumber(value);
32+
case 'string':
33+
return this.writeStr(value);
34+
case 'object': {
35+
if (value === null) return this.writeNull();
36+
const constructor = value.constructor;
37+
switch (constructor) {
38+
case Object:
39+
return this.writeObj(value as Record<string, unknown>);
40+
case Array:
41+
return this.writeArr(value as unknown[]);
42+
case Uint8Array:
43+
return this.writeBin(value as Uint8Array);
44+
default:
45+
return this.writeUnknown(value);
46+
}
47+
}
48+
case 'bigint':
49+
return this.writeLong(value);
50+
case 'undefined':
51+
return this.writeNull();
52+
default:
53+
return this.writeUnknown(value);
54+
}
55+
}
56+
57+
/**
58+
* Writes an Avro null value.
59+
*/
60+
public writeNull(): void {
61+
// Null values are encoded as zero bytes
62+
}
63+
64+
/**
65+
* Writes an Avro boolean value.
66+
*/
67+
public writeBoolean(bool: boolean): void {
68+
this.writer.u8(bool ? 1 : 0);
69+
}
70+
71+
/**
72+
* Writes an Avro int value using zigzag encoding.
73+
*/
74+
public writeInt(int: number): void {
75+
this.writeVarIntSigned(this.encodeZigZag32(Math.trunc(int)));
76+
}
77+
78+
/**
79+
* Writes an Avro long value using zigzag encoding.
80+
*/
81+
public writeLong(long: number | bigint): void {
82+
if (typeof long === 'bigint') {
83+
this.writeVarLong(this.encodeZigZag64(long));
84+
} else {
85+
this.writeVarLong(this.encodeZigZag64(BigInt(Math.trunc(long))));
86+
}
87+
}
88+
89+
/**
90+
* Writes an Avro float value using IEEE 754 single-precision.
91+
*/
92+
public writeFloatAvro(float: number): void {
93+
const writer = this.writer;
94+
writer.ensureCapacity(4);
95+
writer.view.setFloat32(writer.x, float, true); // little-endian
96+
writer.move(4);
97+
}
98+
99+
/**
100+
* Writes an Avro double value using IEEE 754 double-precision.
101+
*/
102+
public writeDouble(double: number): void {
103+
const writer = this.writer;
104+
writer.ensureCapacity(8);
105+
writer.view.setFloat64(writer.x, double, true); // little-endian
106+
writer.move(8);
107+
}
108+
109+
/**
110+
* Writes an Avro bytes value with length-prefixed encoding.
111+
*/
112+
public writeBin(bytes: Uint8Array): void {
113+
this.writeVarIntUnsigned(bytes.length);
114+
this.writer.buf(bytes, bytes.length);
115+
}
116+
117+
/**
118+
* Writes an Avro string value with UTF-8 encoding and length prefix.
119+
*/
120+
public writeStr(str: string): void {
121+
const writer = this.writer;
122+
const maxSize = str.length * 4; // Max UTF-8 bytes for string
123+
writer.ensureCapacity(5 + maxSize); // 5 bytes max for varint length
124+
125+
// Reserve space for length (we'll come back to fill this)
126+
const lengthOffset = writer.x;
127+
writer.x += 5; // Max varint size
128+
129+
// Write the string and get actual byte count
130+
const bytesWritten = writer.utf8(str);
131+
const endPos = writer.x;
132+
133+
// Go back to encode the actual length
134+
writer.x = lengthOffset;
135+
this.writeVarIntUnsigned(bytesWritten);
136+
const actualLengthSize = writer.x - lengthOffset;
137+
138+
// If we reserved more space than needed, shift the string data
139+
if (actualLengthSize < 5) {
140+
const stringStart = lengthOffset + 5;
141+
const stringData = writer.uint8.slice(stringStart, endPos);
142+
writer.x = lengthOffset + actualLengthSize;
143+
writer.buf(stringData, stringData.length);
144+
} else {
145+
writer.x = endPos;
146+
}
147+
}
148+
149+
/**
150+
* Writes an Avro array with length-prefixed encoding.
151+
*/
152+
public writeArr(arr: unknown[]): void {
153+
this.writeVarIntUnsigned(arr.length);
154+
const length = arr.length;
155+
for (let i = 0; i < length; i++) {
156+
this.writeAny(arr[i]);
157+
}
158+
this.writeVarIntUnsigned(0); // End of array marker
159+
}
160+
161+
/**
162+
* Writes an Avro map with length-prefixed encoding.
163+
*/
164+
public writeObj(obj: Record<string, unknown>): void {
165+
const entries = Object.entries(obj);
166+
const length = entries.length;
167+
this.writeVarIntUnsigned(length);
168+
for (let i = 0; i < length; i++) {
169+
const entry = entries[i];
170+
this.writeStr(entry[0]);
171+
this.writeAny(entry[1]);
172+
}
173+
this.writeVarIntUnsigned(0); // End of map marker
174+
}
175+
176+
// BinaryJsonEncoder interface methods
177+
178+
/**
179+
* Generic number writing - determines type based on value
180+
*/
181+
public writeNumber(num: number): void {
182+
if (Number.isInteger(num)) {
183+
if (num >= -2147483648 && num <= 2147483647) {
184+
this.writeInt(num);
185+
} else {
186+
this.writeLong(num);
187+
}
188+
} else {
189+
this.writeDouble(num);
190+
}
191+
}
192+
193+
/**
194+
* Writes an integer value
195+
*/
196+
public writeInteger(int: number): void {
197+
this.writeInt(int);
198+
}
199+
200+
/**
201+
* Writes an unsigned integer value
202+
*/
203+
public writeUInteger(uint: number): void {
204+
this.writeInt(uint);
205+
}
206+
207+
/**
208+
* Writes a float value (interface method)
209+
*/
210+
public writeFloat(float: number): void {
211+
this.writeFloatValue(float);
212+
}
213+
214+
/**
215+
* Writes a float value using IEEE 754 single-precision.
216+
*/
217+
private writeFloatValue(float: number): void {
218+
const writer = this.writer;
219+
writer.ensureCapacity(4);
220+
writer.view.setFloat32(writer.x, float, true); // little-endian
221+
writer.move(4);
222+
}
223+
224+
/**
225+
* Writes an ASCII string (same as regular string in Avro)
226+
*/
227+
public writeAsciiStr(str: string): void {
228+
const writer = this.writer;
229+
this.writeVarIntUnsigned(str.length);
230+
writer.ascii(str);
231+
}
232+
233+
// Utility methods for Avro encoding
234+
235+
/**
236+
* Encodes a variable-length integer (for signed values with zigzag)
237+
*/
238+
private writeVarIntSigned(value: number): void {
239+
const writer = this.writer;
240+
let n = value >>> 0; // Convert to unsigned 32-bit
241+
while (n >= 0x80) {
242+
writer.u8((n & 0x7f) | 0x80);
243+
n >>>= 7;
244+
}
245+
writer.u8(n & 0x7f);
246+
}
247+
248+
/**
249+
* Encodes a variable-length integer (for unsigned values like lengths)
250+
*/
251+
private writeVarIntUnsigned(value: number): void {
252+
const writer = this.writer;
253+
let n = value >>> 0; // Convert to unsigned 32-bit
254+
while (n >= 0x80) {
255+
writer.u8((n & 0x7f) | 0x80);
256+
n >>>= 7;
257+
}
258+
writer.u8(n & 0x7f);
259+
}
260+
261+
/**
262+
* Encodes a variable-length long using Avro's encoding
263+
*/
264+
private writeVarLong(value: bigint): void {
265+
const writer = this.writer;
266+
let n = value;
267+
const mask = BigInt(0x7f);
268+
const shift = BigInt(7);
269+
270+
while (n >= BigInt(0x80)) {
271+
writer.u8(Number((n & mask) | BigInt(0x80)));
272+
n >>= shift;
273+
}
274+
writer.u8(Number(n & mask));
275+
}
276+
277+
/**
278+
* Encodes a 32-bit integer using zigzag encoding
279+
*/
280+
private encodeZigZag32(value: number): number {
281+
return (value << 1) ^ (value >> 31);
282+
}
283+
284+
/**
285+
* Encodes a 64-bit integer using zigzag encoding
286+
*/
287+
private encodeZigZag64(value: bigint): bigint {
288+
return (value << BigInt(1)) ^ (value >> BigInt(63));
289+
}
290+
}

0 commit comments

Comments
 (0)