Extracted image has visibly worse resolution #3597
              
                Unanswered
              
          
                  
                    
                      aleblanc30
                    
                  
                
                  asked this question in
                Looking for help
              
            Replies: 1 comment
-
| Hi again, I may not have provided enough information in my initial post, so let me update it with my script and the PDF I am working with. If anyone wants to engage with this question but lacks information, please ask me to expand; I'll gladly do so, as I need the help :-)
import pymupdf
import os
import matplotlib.pyplot as plt
from PIL import Image, ImageTk
from pprint import pprint
from math import sqrt
import numpy as np
from copy import deepcopy
# Open the PDF that sits next to this script and load its first page.
# The folder is kept in `script_dir` rather than `dir`, which would shadow the builtin.
script_dir = os.path.dirname(__file__)
fn = os.path.join(script_dir, 'to_extract.pdf')
doc = pymupdf.open(fn)
page = doc.load_page(0)
# Extract the scale ticks: drawings that consist of exactly one line item
# ('l', start, end) which is horizontal and shorter than 0.65 page units.
ticks = []
for path in page.get_drawings():
    segments = path['items']
    if len(segments) != 1:
        continue
    seg = segments[0]
    if len(seg) != 3 or seg[0] != 'l':
        continue
    start, end = seg[1], seg[2]
    if start.y != end.y:
        # not horizontal
        continue
    length = sqrt((start.x - end.x)**2 + (start.y - end.y)**2)
    if length < .65:
        ticks.append(path)
# Extract rasters, colorbars and scale labels.
# Every image of the page is classified by its pixel size:
#   height > 120 and width < 100    -> colour-scale bar
#   height > 120 and width > 160    -> data raster
#   width <= 10 and height <= 20    -> candidate tick label
#   anything else                   -> others
scale_bars = []
labels = []
rasters = []
others = []
page.clean_contents()
for item in page.get_images(full=True):
    xref = item[0]
    smask = item[1]
    try:
        bbox, transform = page.get_image_bbox(item, transform=True)
    except Exception:
        # Best effort: an image whose bbox/matrix cannot be obtained is skipped.
        # `Exception` instead of a bare `except` so Ctrl-C / SystemExit still
        # propagate.
        continue
    # The dict returned by extract_image is extended with placement info so
    # that later steps only need this one record per image.
    img = doc.extract_image(xref)
    img['xref'] = xref
    img['smask'] = smask
    img['bbox'] = bbox
    img['transform'] = transform
    w, h = img['width'], img['height']
    if h > 120:
        if w < 100:
            scale_bars.append(img)
        elif w > 160:
            rasters.append(img)
        else:
            others.append(img)
    elif w <= 10 and h <= 20:
        # Look for the tick whose first end point is closest to the middle of
        # the image's right edge.
        x, y = bbox.x1, .5*(bbox.y1 + bbox.y0)
        dmin, imin = 1e10, -1
        for i, t in enumerate(ticks):
            p1 = t['items'][0][1]
            d = sqrt((p1.x - x)**2 + (p1.y - y)**2)
            if d < dmin:
                dmin, imin = d, i
        # Only a tick closer than 3 units counts; a small image without such a
        # tick is dropped (it is stored neither in `labels` nor in `others`).
        if dmin < 3:
            ticks[imin]['label_img'] = img
            labels.append(img)
    else:
        others.append(img)
# Run OCR on the label image attached to each tick and store the recognised
# integer, clamped to the range 0..3, under tick['label'].
for tick in ticks:
    if 'label_img' not in tick:
        continue
    # The image bytes are already in the record returned by extract_image.
    pix = pymupdf.Pixmap(tick['label_img']['image'])
    if pix.alpha:
        # Drop the alpha channel before OCR. The previous PIL/numpy round trip
        # fed 4-channel samples to a 3-channel Pixmap in this case.
        pix = pymupdf.Pixmap(pix, 0)
    # OCR works by producing a one-page PDF with a text layer and reading it back.
    with pymupdf.open("pdf", pix.pdfocr_tobytes()) as temp_doc:
        text = temp_doc[0].get_text()
    # OCR tends to read the digit zero as the capital letter O.
    text = text.strip().replace('O', '0')
    try:
        value = int(text)
    except ValueError:
        # Empty or non-numeric OCR output: leave this tick without a label so
        # it can be inferred from its neighbours later.
        continue
    tick['label'] = max(0, min(value, 3))
# Attribute ticks to scale bars: a tick belongs to a bar when the second end
# point of its line lies within 10 units of the bar's right edge. For every
# accepted tick the relative height along the bar is stored in
# scale_bar['ticks'] and its OCR label (or None) in scale_bar['ticklabels'].
for scale_bar in scale_bars:
    bar_box = scale_bar['bbox']
    y_lo = min(bar_box.y0, bar_box.y1)
    y_hi = max(bar_box.y0, bar_box.y1)
    right_edge = bar_box.x1
    positions = []
    tick_labels = []
    scale_bar['ticks'] = positions
    scale_bar['ticklabels'] = tick_labels
    for tick in ticks:
        line = tick['items'][0]
        start, end = line[1], line[2]
        dist_sq = (right_edge - end.x)**2
        if end.y < y_lo or end.y > y_hi:
            # NOTE(review): `max` adds the distance to the FAR end of the bar,
            # so any tick outside the bar's vertical range is pushed away by at
            # least the bar height. `min` would give the true distance to the
            # edge; confirm which one is intended.
            dist_sq += max((end.y - y_lo)**2, (end.y - y_hi)**2)
        if sqrt(dist_sq) < 10:
            positions.append((y_hi - start.y)/(y_hi - y_lo))
            tick_labels.append(tick.get('label', None))
        
class Graph:
    """A data raster together with the scale bar that is matched to it.

    Attributes:
        raster: the image record passed to the constructor; its ``'bbox'``
            entry must expose ``x1``, ``y0`` and ``y1``.
        x: right edge of the raster's bbox.
        y: vertical centre of the raster's bbox.
        scale_bar: the scale bar record assigned to this graph, ``None`` until
            one has been attached.
    """

    def __init__(self, raster):
        self.raster = raster
        bbox = raster['bbox']
        self.x = bbox.x1
        self.y = .5*(bbox.y0 + bbox.y1)
        # Declared here so the attribute always exists, even before matching.
        self.scale_bar = None
# Arrange the rasters as a grid: order them top to bottom, cut that sequence
# into rows (five rows of four, the remainder forms the last row), then order
# each row left to right.
by_height = sorted((Graph(r) for r in rasters), key=lambda g: g.y)
row_bounds = [(0, 4), (4, 8), (8, 12), (12, 16), (16, 20), (20, None)]
graphs = [
    sorted(by_height[first:last], key=lambda g: g.x)
    for first, last in row_bounds
]
# Attribute scales to graphs: every scale bar goes to the graph whose reference
# point (right edge, vertical centre) is nearest to the bar's own reference
# point (left edge, vertical centre).
graphs_with_scales = []
for scale_bar in scale_bars:
    bar_box = scale_bar['bbox']
    x = bar_box.x0
    y = .5*(bar_box.y0 + bar_box.y1)
    # Sentinel indices: they stay in place (and the assignment below fails)
    # if no graph is closer than 1e5.
    nearest = (1000, 1000)
    dmin = 1e5
    for i, row in enumerate(graphs):
        for j, g in enumerate(row):
            d = sqrt((x - g.x)**2 + (y - g.y)**2)
            if d < dmin:
                dmin, nearest = d, (i, j)
    graphs[nearest[0]][nearest[1]].scale_bar = scale_bar
# Handle misplaced scale bars. Keys are the (row, column) of graphs that need a
# scale bar, values the (row, column) of the graph whose bar is copied to them.
missing_correct = {
    (2, 0): (2, 1),
    (0, 2): (1, 3),
    (1, 2): (1, 3),
    (0, 3): (1, 3),
    (4, 0): (3, 0),
    (4, 1): (3, 1),
    (5, 1): (3, 1),
}
for (i, j), (src_i, src_j) in missing_correct.items():
    g = graphs[i][j]
    # Deep copy, so moving the bbox below leaves the donor's bar untouched.
    g.scale_bar = deepcopy(graphs[src_i][src_j].scale_bar)
    bbox = g.scale_bar['bbox']
    w = bbox.x1 - bbox.x0
    h = bbox.y1 - bbox.y0
    # Re-anchor the copy one unit right of the raster, aligned with its y0,
    # keeping the bar's width and height.
    left = g.raster['bbox'].x1 + 1
    top = g.raster['bbox'].y0
    bbox.x0, bbox.y0 = left, top
    bbox.x1, bbox.y1 = left + w, top + h
# Check consistency of OCR ticks and attribute labels to the other ticks.
# The ticks of each scale bar are sorted by position; a tick without a label
# takes the label of its lower neighbour plus one, or of its upper neighbour
# minus one.
for row in graphs:
    for g in row:
        sb = g.scale_bar
        # [position, label] pairs. Named `pairs` so the module-level `ticks`
        # list (the tick drawings) is not overwritten.
        pairs = sorted(
            ([t, tl] for t, tl in zip(sb['ticks'], sb['ticklabels'])),
            key=lambda p: p[0],
        )
        # Forward fill: one ascending pass reaches every gap after a known label.
        for k in range(1, len(pairs)):
            if pairs[k][1] is None and pairs[k-1][1] is not None:
                pairs[k][1] = pairs[k-1][1] + 1
        # Backward fill, walking downwards so that a single pass reaches every
        # gap before the first known label (walking upwards fills one per pass).
        for k in reversed(range(len(pairs) - 1)):
            if pairs[k][1] is None and pairs[k+1][1] is not None:
                pairs[k][1] = pairs[k+1][1] - 1
        sb['ticks'] = [p[0] for p in pairs]
        sb['ticklabels'] = [p[1] for p in pairs]
        if None in sb['ticklabels']:
            # No label was recognised at all for this bar: fall back to the
            # default labels. NOTE(review): this assumes the bar has exactly
            # five ticks - confirm.
            sb['ticklabels'] = [-1, 0, 1, 2, 3]
# Create a plot to ascertain that everything is in order: every raster bbox is
# drawn as a coloured rectangle, every scale bar image is drawn at its bbox,
# and the tick labels are written at the tick positions.
plot = False
if plot:
    import matplotlib.patches as patches
    plt.figure()
    ax = plt.axes()
    ax.set_xlim(-100,1000)
    ax.set_ylim(-100,1000)
    ax.invert_yaxis()  # PDF page coordinates grow downwards
    c = ['r', 'g', 'b', 'c', 'y', 'm']
    for i, row in enumerate(graphs):
        for j, g in enumerate(row):
            bbox = g.raster['bbox']
            rect = patches.Rectangle((bbox.x0, bbox.y0), bbox.x1-bbox.x0, bbox.y1-bbox.y0, fc=c[(i+j)%6], fill=True, visible=True)
            ax.add_artist(rect)
            bbox = g.scale_bar['bbox']
            x0, y0 = bbox.x0, bbox.y0
            w, h = bbox.x1-bbox.x0, bbox.y1-bbox.y0
            pix = pymupdf.Pixmap(doc.extract_image(g.scale_bar['xref'])['image'])
            if g.scale_bar['smask']:
                # A soft mask exists only when its xref is non-zero; combine it
                # with the base image to get the transparency.
                mask = pymupdf.Pixmap(doc.extract_image(g.scale_bar['smask'])['image'])
                pix = pymupdf.Pixmap(pix, mask)
            mode = "RGBA" if pix.alpha else "RGB"
            img_ = Image.frombytes(mode, [pix.width, pix.height], pix.samples)
            ax.imshow(img_, extent=[x0, x0+w, y0, y0+h], origin='lower')
            xticks, yticks = [], []
            for tick, lbl in zip(g.scale_bar['ticks'], g.scale_bar['ticklabels']):
                # `tick` is the relative height along the bar, measured from bbox.y1
                x, y = x0+.5*w, bbox.y1-h*tick
                ax.text(x, y, lbl)
                xticks.append(x)
                yticks.append(y)
            plt.plot(xticks, yticks, 'ok')
    plt.show()
# Extract timeseries: for every graph, sample the raster image on a regular
# 19 x 45 grid and show the original (with the sample points) next to the
# sampled result.
import io
from scipy.interpolate import interp1d, RegularGridInterpolator
for row in graphs:
    for g in row:
        # Colour scale: pixel column 5 of the scale bar image, rows reversed.
        pix = pymupdf.Pixmap(doc.extract_image(g.scale_bar['xref'])['image'])
        mode = "RGBA" if pix.alpha else "RGB"
        scale = np.array(Image.frombytes(mode, [pix.width, pix.height], pix.samples), dtype=np.uint8)
        scale = scale[::-1,5,:]
        print(g.raster)
        # Decode the embedded image bytes directly; force RGB so that the
        # channel indexing below works whatever the stored pixel format is.
        raster = np.array(Image.open(io.BytesIO(g.raster['image'])).convert('RGB'), dtype=np.uint8)
        print(raster.shape)
        # Normalised pixel coordinates: 0..1 along the rows and along the columns.
        rows = np.linspace(0,1,raster.shape[0])
        cols = np.linspace(0,1,raster.shape[1])
        ni = 19
        si = np.linspace(0,1,ni)
        si = .022*si+.97*(1-si)
        nj = 45
        sj = np.linspace(0,1,nj)
        sj = .01*sj+.985*(1-sj)
        _, axs = plt.subplots(1,2)
        axs[0].imshow(raster, extent=[0,1,0,1])
        # Named grid_i / grid_j so the sample grid does not reuse loop index names.
        grid_i, grid_j = np.meshgrid(si[::-1], sj[::-1], indexing='ij')
        axs[0].plot(grid_j.flatten(), grid_i.flatten(), 'or')
        sampled = np.empty((ni, nj, 3), dtype=np.uint8)
        print(grid_i.shape, grid_j.shape, sampled.shape)
        for ch in range(3):
            # RegularGridInterpolator defaults to linear interpolation, so each
            # sample blends the surrounding pixels, and the float result is
            # truncated when stored in the uint8 array.
            # NOTE(review): method='nearest' would return unblended pixel
            # values - consider it if exact colours are needed.
            channel = RegularGridInterpolator((rows, cols), raster[:,:,ch])
            sampled[:,:,ch] = channel((grid_i, grid_j))
        axs[1].imshow(sampled, extent=[0,1,0,1])
        plt.show()
Beta Was this translation helpful? Give feedback.
                  
                    0 replies
                  
                
            
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment
  
        
    
Uh oh!
There was an error while loading. Please reload this page.
-
Hi ! I'm looking to extract image data from a pdf.
I wrote a script that goes like this :
However, the image I get is visibly of lower resolution than the one in the PDF. I know this because the image is made of rectangles, which are sharp in the PDF but blurred in the extracted image.
Most questions asked online about resolution are about the resolution of the image of a whole page, obtained with
page.get_pixmap, and are solved with the dpi or transform arguments. This does not seem to apply to my issue. Any help would be appreciated.
Beta Was this translation helpful? Give feedback.
All reactions