@@ -3169,10 +3169,19 @@ PyObject *JM_fitz_config()
3169
3169
//----------------------------------------------------------------------------
3170
3170
// Convert the content of an fz_buffer into a new Python bytes object.
//
// ctx:    MuPDF context used to access the buffer storage.
// buffer: source buffer; if NULL, an empty bytes object is returned.
//
// Returns a NEW reference owned by the caller, or NULL (with a Python
// exception set) if the bytes object could not be created.
PyObject *JM_BinFromBuffer(fz_context *ctx, fz_buffer *buffer)
{
    unsigned char *data = NULL;
    size_t len = 0;

    if (!buffer)
    {
        return PyBytes_FromString("");
    }

    // Use the context handed in by the caller instead of the global one.
    len = fz_buffer_storage(ctx, buffer, &data);

    // The PyBytes_* constructors already return a new reference. An extra
    // Py_INCREF here would leak the object on every call and would
    // dereference NULL if the allocation failed.
    return PyBytes_FromStringAndSize((const char *) data, (Py_ssize_t) len);
}
3177
3186
3178
3187
//----------------------------------------------------------------------------
@@ -3296,7 +3305,7 @@ void hexlify(int n, unsigned char *in, unsigned char *out)
3296
3305
}
3297
3306
3298
3307
//----------------------------------------------------------------------------
3299
- // Turn Python a bytes or bytearray object into char* string
3308
+ // Turn a bytes or bytearray object into char* string
3300
3309
// using the "_AsString" functions. Returns string size or 0 on error.
3301
3310
//----------------------------------------------------------------------------
3302
3311
size_t JM_CharFromBytesOrArray(PyObject *stream, char **data)
@@ -3317,6 +3326,31 @@ size_t JM_CharFromBytesOrArray(PyObject *stream, char **data)
3317
3326
return len;
3318
3327
}
3319
3328
3329
+ //----------------------------------------------------------------------------
3330
+ // Return fz_buffer from a PyBytes or PyByteArray object
3331
+ //----------------------------------------------------------------------------
3332
+ fz_buffer *JM_BufferFromBytes(fz_context *ctx, PyObject *stream)
3333
+ {
3334
+ if (!stream) return NULL;
3335
+ char *c = NULL;
3336
+ size_t len = JM_CharFromBytesOrArray(stream, &c);
3337
+ if (!c) return NULL;
3338
+ fz_buffer *res = NULL;
3339
+ fz_var(res);
3340
+ fz_try(ctx)
3341
+ {
3342
+ res = fz_new_buffer(ctx, len);
3343
+ fz_append_data(ctx, res, c, len);
3344
+ fz_terminate_buffer(ctx, res);
3345
+ }
3346
+ fz_catch(ctx)
3347
+ {
3348
+ fz_drop_buffer(ctx, res);
3349
+ fz_rethrow(ctx);
3350
+ }
3351
+ return res;
3352
+ }
3353
+
3320
3354
//----------------------------------------------------------------------------
3321
3355
// Modified copy of SWIG_Python_str_AsChar
3322
3356
// If Py3, the SWIG original v3.0.12 does *not* deliver NULL for a
@@ -4169,12 +4203,91 @@ JM_style_begin_dict(fz_context *ctx, PyObject *span, fz_font *font, float size,
4169
4203
void
4170
4204
JM_style_end_dict(fz_context *ctx, fz_buffer *buff, PyObject *span, PyObject *spanlist)
4171
4205
{
4172
- PyDict_SetItemString(span, "text", JM_StrFromBuffer(ctx, buff));
4206
+ if (buff)
4207
+ PyDict_SetItemString(span, "text", JM_StrFromBuffer(ctx, buff));
4173
4208
PyList_Append(spanlist, span);
4174
4209
}
4175
4210
4176
4211
PyObject *
4177
- JM_extract_stext_textblock_as_dict(fz_context *ctx, fz_stext_block *block)
4212
+ JM_extract_stext_textchar_as_dict(fz_context *ctx, fz_stext_char *ch)
4213
+ {
4214
+ PyObject *chardict = NULL;
4215
+
4216
+ chardict = PyDict_New();
4217
+ PyDict_SetItemString(chardict, "c", Py_BuildValue("C", ch->c));
4218
+ PyDict_SetItemString(chardict, "origin", Py_BuildValue("ff", ch->origin.x, ch->origin.y));
4219
+ PyDict_SetItemString(chardict, "bbox", Py_BuildValue("ffff",
4220
+ ch->bbox.x0, ch->bbox.y0,
4221
+ ch->bbox.x1, ch->bbox.y1));
4222
+ return chardict;
4223
+ }
4224
+
4225
+ PyObject *
4226
+ JM_extract_stext_textline_as_dict(fz_context *ctx, fz_stext_line *line)
4227
+ {
4228
+ fz_stext_char *ch;
4229
+ fz_font *font = NULL;
4230
+ fz_buffer *buff = NULL;
4231
+ float size = 0;
4232
+ int sup = 0;
4233
+ PyObject *span=NULL, *spanlist = NULL, *linedict = NULL, *charlist;
4234
+ PyObject *chardict;
4235
+
4236
+ linedict = PyDict_New();
4237
+ fz_rect *linerect = JM_empty_rect();
4238
+ PyDict_SetItemString(linedict, "wmode", Py_BuildValue("i", line->wmode));
4239
+ PyDict_SetItemString(linedict, "dir", Py_BuildValue("ff", line->dir.x, line->dir.y));
4240
+ spanlist = PyList_New(0);
4241
+ font = NULL;
4242
+ size = 0;
4243
+
4244
+ for (ch = line->first_char; ch; ch = ch->next)
4245
+ {
4246
+ JM_join_rect(linerect, &ch->bbox, ch->size);
4247
+
4248
+ int ch_sup = detect_super_script(line, ch);
4249
+ if (ch->font != font || ch->size != size)
4250
+ { // start new span
4251
+ if (font) // must finish old span first
4252
+ {
4253
+ PyDict_SetItemString(span, "chars", charlist);
4254
+ Py_CLEAR(charlist);
4255
+ JM_style_end_dict(ctx, NULL, span, spanlist);
4256
+ Py_CLEAR(span);
4257
+ font = NULL;
4258
+ }
4259
+ font = ch->font;
4260
+ size = ch->size;
4261
+ sup = ch_sup;
4262
+ charlist = PyList_New(0);
4263
+ span = PyDict_New();
4264
+ JM_style_begin_dict(ctx, span, font, size, sup);
4265
+ }
4266
+ chardict = JM_extract_stext_textchar_as_dict(ctx, ch);
4267
+ PyList_Append(charlist, chardict);
4268
+ Py_CLEAR(chardict);
4269
+ }
4270
+ if (font)
4271
+ {
4272
+ PyDict_SetItemString(span, "chars", charlist);
4273
+ Py_CLEAR(charlist);
4274
+ JM_style_end_dict(ctx, NULL, span, spanlist);
4275
+ Py_CLEAR(span);
4276
+ font = NULL;
4277
+ }
4278
+
4279
+ PyDict_SetItemString(linedict, "spans", spanlist);
4280
+ Py_CLEAR(spanlist);
4281
+ PyDict_SetItemString(linedict, "bbox", Py_BuildValue("ffff",
4282
+ linerect->x0, linerect->y0,
4283
+ linerect->x1, linerect->y1));
4284
+
4285
+ free(linerect);
4286
+ return linedict;
4287
+ }
4288
+
4289
+ PyObject *
4290
+ JM_extract_stext_textblock_as_dict(fz_context *ctx, fz_stext_block *block, int rawdict)
4178
4291
{
4179
4292
fz_stext_line *line;
4180
4293
fz_stext_char *ch;
@@ -4190,6 +4303,15 @@ JM_extract_stext_textblock_as_dict(fz_context *ctx, fz_stext_block *block)
4190
4303
4191
4304
for (line = block->u.t.first_line; line; line = line->next)
4192
4305
{
4306
+ if (rawdict != 0)
4307
+ {
4308
+ linedict = JM_extract_stext_textline_as_dict(ctx, line);
4309
+ PyList_Append(linelist, linedict);
4310
+ Py_CLEAR(linedict);
4311
+ JM_join_rect(blockrect, &line->bbox, 0.0f);
4312
+ continue;
4313
+ }
4314
+
4193
4315
linedict = PyDict_New();
4194
4316
fz_rect *linerect = JM_empty_rect();
4195
4317
PyDict_SetItemString(linedict, "wmode", Py_BuildValue("i", line->wmode));
@@ -4226,6 +4348,8 @@ JM_extract_stext_textblock_as_dict(fz_context *ctx, fz_stext_block *block)
4226
4348
{
4227
4349
JM_style_end_dict(ctx, buff, span, spanlist);
4228
4350
Py_CLEAR(span);
4351
+ fz_drop_buffer(ctx, buff);
4352
+ buff = NULL;
4229
4353
font = NULL;
4230
4354
}
4231
4355
@@ -4313,7 +4437,7 @@ JM_extract_stext_imageblock_as_dict(fz_context *ctx, fz_stext_block *block)
4313
4437
}
4314
4438
4315
4439
PyObject *
4316
- JM_stext_page_as_dict(fz_context *ctx, fz_stext_page *page)
4440
+ JM_stext_page_as_dict(fz_context *ctx, fz_stext_page *page, int rawdict )
4317
4441
{
4318
4442
PyObject *dict = PyDict_New();
4319
4443
PyObject *blocklist = PyList_New(0);
@@ -4327,7 +4451,7 @@ JM_stext_page_as_dict(fz_context *ctx, fz_stext_page *page)
4327
4451
if (block->type == FZ_STEXT_BLOCK_IMAGE)
4328
4452
PyList_Append(blocklist, JM_extract_stext_imageblock_as_dict(ctx, block));
4329
4453
else
4330
- PyList_Append(blocklist, JM_extract_stext_textblock_as_dict(ctx, block));
4454
+ PyList_Append(blocklist, JM_extract_stext_textblock_as_dict(ctx, block, rawdict ));
4331
4455
}
4332
4456
PyDict_SetItemString(dict, "blocks", blocklist);
4333
4457
Py_CLEAR(blocklist);
@@ -7155,24 +7279,19 @@ SWIGINTERN PyObject *fz_document_s__updateStream(struct fz_document_s *self,int
7155
7279
fz_var(obj);
7156
7280
fz_buffer *res = NULL;
7157
7281
fz_var(res);
7158
- size_t len = 0;
7159
- char *c = NULL;
7160
7282
pdf_document *pdf = pdf_specifics(gctx, self); // get pdf doc
7161
7283
fz_try(gctx)
7162
7284
{
7163
7285
assert_PDF(pdf);
7164
7286
int xreflen = pdf_xref_len(gctx, pdf);
7165
7287
if (!INRANGE(xref, 1, xreflen-1))
7166
7288
THROWMSG("xref out of range");
7167
- len = JM_CharFromBytesOrArray(stream, &c);
7168
- if (!c) THROWMSG("stream must be bytes or bytearray");
7169
7289
// get the object
7170
7290
obj = pdf_new_indirect(gctx, pdf, xref, 0);
7171
- if (new == 0 && !pdf_is_stream(gctx, obj))
7291
+ if (! new && !pdf_is_stream(gctx, obj))
7172
7292
THROWMSG("xref not a stream object");
7173
- res = fz_new_buffer(gctx, len);
7174
- fz_append_data(gctx, res, c, len);
7175
- fz_terminate_buffer(gctx, res);
7293
+ res = JM_BufferFromBytes(gctx, stream);
7294
+ if (!res) THROWMSG("stream must be bytes or bytearray");
7176
7295
JM_update_stream(gctx, pdf, obj, res);
7177
7296
7178
7297
}
@@ -9984,7 +10103,7 @@ SWIGINTERN PyObject *fz_stext_page_s__extractText(struct fz_stext_page_s *self,i
9984
10103
fz_print_stext_page_as_html(gctx, out, self);
9985
10104
break;
9986
10105
case(2):
9987
- text = JM_stext_page_as_dict(gctx, self);
10106
+ text = JM_stext_page_as_dict(gctx, self, 0 );
9988
10107
break;
9989
10108
case(3):
9990
10109
fz_print_stext_page_as_xml(gctx, out, self);
@@ -9993,7 +10112,10 @@ SWIGINTERN PyObject *fz_stext_page_s__extractText(struct fz_stext_page_s *self,i
9993
10112
fz_print_stext_page_as_xhtml(gctx, out, self);
9994
10113
break;
9995
10114
case(5):
9996
- text = JM_stext_page_as_dict(gctx, self);
10115
+ text = JM_stext_page_as_dict(gctx, self, 0);
10116
+ break;
10117
+ case(6):
10118
+ text = JM_stext_page_as_dict(gctx, self, 1);
9997
10119
break;
9998
10120
default:
9999
10121
JM_print_stext_page_as_text(gctx, out, self);
0 commit comments