Skip to content

Commit

Permalink
Merge branch 'dev' into dev
Browse files: browse the repository at this point in the history
  • Loading branch information
myhloli authored Jan 6, 2025
2 parents: 2e1bf88 + ad9abc3; commit: 8a0aa7a
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 8 deletions.
14 changes: 8 additions & 6 deletions magic_pdf/data/data_reader_writer/multi_bucket_s3.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os

from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
from magic_pdf.data.io.s3 import S3Reader, S3Writer
Expand All @@ -22,10 +22,10 @@ def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
"""
if len(default_prefix) == 0:
raise InvalidConfig('default_prefix must be provided')
arr = default_prefix.strip("/").split("/")

arr = default_prefix.strip('/').split('/')
self.default_bucket = arr[0]
self.default_prefix = "/".join(arr[1:])
self.default_prefix = '/'.join(arr[1:])

found_default_bucket_config = False
for conf in s3_configs:
Expand Down Expand Up @@ -103,7 +103,8 @@ def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
s3_reader = self.__get_s3_client(bucket_name)
else:
s3_reader = self.__get_s3_client(self.default_bucket)
path = os.path.join(self.default_prefix, path)
if self.default_prefix:
path = self.default_prefix + '/' + path
return s3_reader.read_at(path, offset, limit)


Expand Down Expand Up @@ -139,5 +140,6 @@ def write(self, path: str, data: bytes) -> None:
s3_writer = self.__get_s3_client(bucket_name)
else:
s3_writer = self.__get_s3_client(self.default_bucket)
path = os.path.join(self.default_prefix, path)
if self.default_prefix:
path = self.default_prefix + '/' + path
return s3_writer.write(path, data)
3 changes: 2 additions & 1 deletion magic_pdf/pdf_parse_union_core_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ def chars_to_content(span):

content = ''
for char in span['chars']:
# 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格

# 如果下一个char的x0和上一个char的x1距离超过0.25个字符宽度,则需要在中间插入一个空格
char1 = char
char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None
if char2 and char2['bbox'][0] - char1['bbox'][2] > char_avg_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
Expand Down
2 changes: 1 addition & 1 deletion projects/web_api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.model.operators import InferenceResult
from magic_pdf.operators.models import InferenceResult

model_config.__use_inside_model__ = True

Expand Down

0 comments on commit 8a0aa7a

Please sign in to comment.