This repository was archived by the owner on Jun 30, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathpydicom_series.py
698 lines (568 loc) · 21.6 KB
/
pydicom_series.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
# pydicom_series.py
"""
By calling the function read_files with a directory name or list
of files as an argument, a list of DicomSeries instances can be
obtained. A DicomSeries object has some attributes that give
information about the series (such as shape, spacing, suid) and
has an info attribute, which is a pydicom.DataSet instance containing
information about the first dicom file in the series. The data can
be obtained using the get_pixel_array() method, which produces a
3D numpy array if there a multiple files in the series.
This module can deal with gated data, in which case a DicomSeries
instance is created for each 3D volume.
"""
#
# Copyright (c) 2010 Almar Klein
# This file is released under the pydicom license.
# See the file LICENSE included with the pydicom distribution, also
# available at https://github.com/pydicom/pydicom
#
# I (Almar) performed some test to loading a series of data
# in two different ways: loading all data, and deferring loading
# the data. Both ways seem equally fast on my system. I have to
# note that results can differ quite a lot depending on the system,
# but still I think this suggests that deferred reading is in
# general not slower. I think deferred loading of the pixel data
# can be advantageous because maybe not all data of all series
# is needed. Also it simply saves memory, because the data is
# removed from the Dataset instances.
# In the few result below, cold means reading for the first time,
# warm means reading 2nd/3d/etc time.
# - Full loading of data, cold: 9 sec
# - Full loading of data, warm: 3 sec
# - Deferred loading of data, cold: 9 sec
# - Deferred loading of data, warm: 3 sec
import gc
import os
import time
import pydicom
from pydicom.sequence import Sequence
# Try importing numpy
try:
import numpy as np
have_numpy = True
except ImportError:
np = None # NOQA
have_numpy = False
# Helper functions and classes
class ProgressBar(object):
""" To print progress to the screen.
"""
def __init__(self, char='-', length=20):
self.char = char
self.length = length
self.progress = 0.0
self.nbits = 0
self.what = ''
def Start(self, what=''):
""" Start(what='')
Start the progress bar, displaying the given text first.
Make sure not to print anything untill after calling
Finish(). Messages can be printed while displaying
progess by using printMessage().
"""
self.what = what
self.progress = 0.0
self.nbits = 0
sys.stdout.write(what + " [")
def Stop(self, message=""):
""" Stop the progress bar where it is now.
Optionally print a message behind it."""
delta = int(self.length - self.nbits)
sys.stdout.write(" " * delta + "] " + message + "\n")
def Finish(self, message=""):
""" Finish the progress bar, setting it to 100% if it
was not already. Optionally print a message behind the bar.
"""
delta = int(self.length - self.nbits)
sys.stdout.write(self.char * delta + "] " + message + "\n")
def Update(self, newProgress):
""" Update progress. Progress is given as a number
between 0 and 1.
"""
self.progress = newProgress
required = self.length * (newProgress)
delta = int(required - self.nbits)
if delta > 0:
sys.stdout.write(self.char * delta)
self.nbits += delta
def PrintMessage(self, message):
""" Print a message (for example a warning).
The message is printed behind the progress bar,
and a new bar is started.
"""
self.Stop(message)
self.Start(self.what)
def _dummyProgressCallback(progress):
""" A callback to indicate progress that does nothing. """
pass
_progressBar = ProgressBar()
def _progressCallback(progress):
""" The default callback for displaying progress. """
if isinstance(progress, str):
_progressBar.Start(progress)
_progressBar._t0 = time.time()
elif progress is None:
dt = time.time() - _progressBar._t0
_progressBar.Finish(f'{dt:2.2f} seconds')
else:
_progressBar.Update(progress)
def _listFiles(files, path):
"""List all files in the directory, recursively. """
for item in os.listdir(path):
item = os.path.join(path, item)
if os.path.isdir(item):
_listFiles(files, item)
else:
files.append(item)
def _splitSerieIfRequired(serie, series):
""" _splitSerieIfRequired(serie, series)
Split the serie in multiple series if this is required.
The choice is based on examing the image position relative to
the previous image. If it differs too much, it is assumed
that there is a new dataset. This can happen for example in
unspitted gated CT data.
"""
# Sort the original list and get local name
serie._sort()
L = serie._datasets
# Init previous slice
ds1 = L[0]
# Check whether we can do this
if "ImagePositionPatient" not in ds1:
return
# Initialize a list of new lists
L2 = [[ds1]]
# Init slice distance estimate
distance = 0
for index in range(1, len(L)):
# Get current slice
ds2 = L[index]
# Get positions
pos1 = float(ds1.ImagePositionPatient[2])
pos2 = float(ds2.ImagePositionPatient[2])
# Get distances
newDist = abs(pos1 - pos2)
# deltaDist = abs(firstPos-pos2)
# If the distance deviates more than 2x from what we've seen,
# we can agree it's a new dataset.
if distance and newDist > 2.1 * distance:
L2.append([])
distance = 0
else:
# Test missing file
if distance and newDist > 1.5 * distance:
print(f'Warning: missing file after "{ds1.filename}"')
distance = newDist
# Add to last list
L2[-1].append(ds2)
# Store previous
ds1 = ds2
# Split if we should
if len(L2) > 1:
# At what position are we now?
i = series.index(serie)
# Create new series
series2insert = []
for L in L2:
newSerie = DicomSeries(serie.suid, serie._showProgress)
newSerie._datasets = Sequence(L)
series2insert.append(newSerie)
# Insert series and remove self
for newSerie in reversed(series2insert):
series.insert(i, newSerie)
series.remove(serie)
def _getPixelDataFromDataset(ds):
""" Get the pixel data from the given dataset. If the data
was deferred, make it deferred again, so that memory is
preserved. Also applies RescaleSlope and RescaleIntercept
if available. """
# Get original element
el = ds['PixelData']
# Get data
data = ds.pixel_array
# Remove data (mark as deferred)
ds['PixelData'] = el
del ds._pixel_array
# Obtain slope and offset
slope = 1
offset = 0
needFloats = False
needApplySlopeOffset = False
if 'RescaleSlope' in ds:
needApplySlopeOffset = True
slope = ds.RescaleSlope
if 'RescaleIntercept' in ds:
needApplySlopeOffset = True
offset = ds.RescaleIntercept
if int(slope) != slope or int(offset) != offset:
needFloats = True
if not needFloats:
slope, offset = int(slope), int(offset)
# Apply slope and offset
if needApplySlopeOffset:
# Maybe we need to change the datatype?
if data.dtype in [np.float32, np.float64]:
pass
elif needFloats:
data = data.astype(np.float32)
else:
# Determine required range
minReq, maxReq = data.min(), data.max()
minReq = min(
[minReq, minReq * slope + offset, maxReq * slope + offset])
maxReq = max(
[maxReq, minReq * slope + offset, maxReq * slope + offset])
# Determine required datatype from that
dtype = None
if minReq < 0:
# Signed integer type
maxReq = max([-minReq, maxReq])
if maxReq < 2 ** 7:
dtype = np.int8
elif maxReq < 2 ** 15:
dtype = np.int16
elif maxReq < 2 ** 31:
dtype = np.int32
else:
dtype = np.float32
else:
# Unsigned integer type
if maxReq < 2 ** 8:
dtype = np.uint8
elif maxReq < 2 ** 16:
dtype = np.uint16
elif maxReq < 2 ** 32:
dtype = np.uint32
else:
dtype = np.float32
# Change datatype
if dtype != data.dtype:
data = data.astype(dtype)
# Apply slope and offset
data *= slope
data += offset
# Done
return data
# The public functions and classes
def find_shape(dataset):
"""Find the expected shape of `dataset.pixel_array` without reading the pixel data.
The returned shape is a tuple"""
shape = dataset.Rows, dataset.Columns
frames = dataset.get('NumberOfFrames', 1) or 1
if frames > 1:
shape = (frames,) + shape
samples = dataset.SamplesPerPixel
if samples > 1:
conf = dataset.PlanarConfiguration
if conf == 0:
shape += (samples,)
elif conf == 1:
shape = (samples,) + shape
else:
raise ValueError(f"Invalid Planar Configuration: '{conf}'")
return shape
def read_files(path, showProgress=False, readPixelData=False, force=False):
""" read_files(path, showProgress=False, readPixelData=False)
Reads dicom files and returns a list of DicomSeries objects, which
contain information about the data, and can be used to load the
image or volume data.
The parameter "path" can also be a list of files or directories.
If the callable "showProgress" is given, it is called with a single
argument to indicate the progress. The argument is a string when a
progress is started (indicating what is processed). A float indicates
progress updates. The paremeter is None when the progress is finished.
When "showProgress" is True, a default callback is used that writes
to stdout. By default, no progress is shown.
if readPixelData is True, the pixel data of all series is read. By
default the loading of pixeldata is deferred until it is requested
using the DicomSeries.get_pixel_array() method. In general, both
methods should be equally fast.
"""
# Init list of files
files = []
# Obtain data from the given path
if isinstance(path, str):
# Make dir nice
basedir = os.path.abspath(path)
# Check whether it exists
if not os.path.isdir(basedir):
raise ValueError('The given path is not a valid directory.')
# Find files recursively
_listFiles(files, basedir)
elif isinstance(path, (tuple, list)):
# Iterate over all elements, which can be files or directories
for p in path:
if os.path.isdir(p):
_listFiles(files, os.path.abspath(p))
elif os.path.isfile(p):
files.append(p)
else:
print(f"Warning, the path '{p}' is not valid.")
else:
raise ValueError('The path argument must be a string or list.')
# Set default progress callback?
if showProgress is True:
showProgress = _progressCallback
if not hasattr(showProgress, '__call__'):
showProgress = _dummyProgressCallback
# Set defer size
deferSize = 16383 # 128**2-1
if readPixelData:
deferSize = None
# Gather file data and put in DicomSeries
series = {}
count = 0
showProgress('Loading series information:')
for filename in files:
# Skip DICOMDIR files
if filename.count("DICOMDIR"):
continue
# Try loading dicom ...
try:
dcm = pydicom.dcmread(filename, deferSize, force=force)
except pydicom.filereader.InvalidDicomError:
continue # skip non-dicom file
except Exception as why:
if showProgress is _progressCallback:
_progressBar.PrintMessage(str(why))
else:
print('Warning:', why)
continue
# Get SUID and register the file with an existing or new series object
try:
suid = dcm.SeriesInstanceUID
except AttributeError:
continue # some other kind of dicom file
if suid not in series:
series[suid] = DicomSeries(suid, showProgress)
series[suid]._append(dcm)
# Show progress (note that we always start with a 0.0)
showProgress(float(count) / len(files))
count += 1
# Finish progress
showProgress(None)
# Make a list and sort, so that the order is deterministic
series = list(series.values())
series.sort(key=lambda x: x.suid)
# Split series if necessary
for serie in reversed([serie for serie in series]):
_splitSerieIfRequired(serie, series)
# Finish all series
showProgress('Analysing series')
series_ = []
for i in range(len(series)):
try:
series[i]._finish()
series_.append(series[i])
except Exception:
pass # Skip serie (probably report-like file without pixels)
showProgress(float(i + 1) / len(series))
showProgress(None)
return series_
class DicomSeries(object):
""" DicomSeries
This class represents a series of dicom files that belong together.
If these are multiple files, they represent the slices of a volume
(like for CT or MRI). The actual volume can be obtained using loadData().
Information about the data can be obtained using the info attribute.
"""
# To create a DicomSeries object, start by making an instance and
# append files using the "_append" method. When all files are
# added, call "_sort" to sort the files, and then "_finish" to evaluate
# the data, perform some checks, and set the shape and spacing
# attributes of the instance.
def __init__(self, suid, showProgress):
# Init dataset list and the callback
self._datasets = Sequence()
self._showProgress = showProgress
# Init props
self._suid = suid
self._info = None
self._shape = None
self._spacing = None
@property
def suid(self):
""" The Series Instance UID. """
return self._suid
@property
def shape(self):
""" The shape of the data (nz, ny, nx).
If None, the series contains a single dicom file. """
return self._shape
@property
def spacing(self):
""" The spacing (voxel distances) of the data (dz, dy, dx).
If None, the series contains a single dicom file. """
return self._spacing
@property
def info(self):
""" A DataSet instance containing the information as present in the
first dicomfile of this series. """
return self._info
@property
def description(self):
""" A description of the dicom series. Used fields are
PatientName, shape of the data, SeriesDescription,
and ImageComments.
"""
info = self.info
# If no info available, return simple description
if info is None:
return "DicomSeries containing %i images" % len(self._datasets)
fields = []
# Give patient name
if 'PatientName' in info:
fields.append(f"{info.PatientName}")
# Also add dimensions
if self.shape:
tmp = [str(d) for d in self.shape]
fields.append('x'.join(tmp))
# Try adding more fields
if 'SeriesDescription' in info:
fields.append("'" + info.SeriesDescription + "'")
if 'ImageComments' in info:
fields.append("'" + info.ImageComments + "'")
# Combine
return ' '.join(fields)
def __repr__(self):
adr = hex(id(self)).upper()
data_len = len(self._datasets)
return "<DicomSeries with %i images at %s>" % (data_len, adr)
def get_pixel_array(self):
""" get_pixel_array()
Get (load) the data that this DicomSeries represents, and return
it as a numpy array. If this serie contains multiple images, the
resulting array is 3D, otherwise it's 2D.
If RescaleSlope and RescaleIntercept are present in the dicom info,
the data is rescaled using these parameters. The data type is chosen
depending on the range of the (rescaled) data.
"""
# Can we do this?
if not have_numpy:
msg = "The Numpy package is required to use get_pixel_array.\n"
raise ImportError(msg)
# It's easy if no file or if just a single file
if len(self._datasets) == 0:
raise ValueError('Serie does not contain any files.')
elif len(self._datasets) == 1:
ds = self._datasets[0]
slice = _getPixelDataFromDataset(ds)
return slice
# Check info
if self.info is None:
raise RuntimeError("Cannot return volume if series not finished.")
# Set callback to update progress
showProgress = self._showProgress
# Init data (using what the dicom packaged produces as a reference)
ds = self._datasets[0]
slice = _getPixelDataFromDataset(ds)
vol = np.zeros(self.shape, dtype=slice.dtype)
vol[0] = slice
# Fill volume
showProgress('Loading data:')
ll = self.shape[0]
for z in range(1, ll):
ds = self._datasets[z]
vol[z] = _getPixelDataFromDataset(ds)
showProgress(float(z) / ll)
# Finish
showProgress(None)
# Done
gc.collect()
return vol
def _append(self, dcm):
""" _append(dcm)
Append a dicomfile (as a pydicom.dataset.FileDataset) to the series.
"""
self._datasets.append(dcm)
def _sort(self):
""" sort()
Sort the datasets by instance number.
"""
self._datasets.sort(key=lambda k: k.InstanceNumber)
def _finish(self):
""" _finish()
Evaluate the series of dicom files. Together they should make up
a volumetric dataset. This means the files should meet certain
conditions. Also some additional information has to be calculated,
such as the distance between the slices. This method sets the
attributes for "shape", "spacing" and "info".
This method checks:
* that there are no missing files
* that the dimensions of all images match
* that the pixel spacing of all images match
"""
# The datasets list should be sorted by instance number
L = self._datasets
if len(L) == 0:
return
elif len(L) < 2:
# Set attributes
ds = self._datasets[0]
self._info = self._datasets[0]
self._shape = find_shape(ds)
self._spacing = ds.PixelSpacing
return
# Get previous
ds1 = L[0]
# Init measures to calculate average of
distance_sum = 0.0
# Init measures to check (these are in 2D)
dimensions = find_shape(ds1)
# row, column
spacing = ds1.PixelSpacing
for index in range(len(L)):
# The first round ds1 and ds2 will be the same, for the
# distance calculation this does not matter
# Get current
ds2 = L[index]
# Get positions
pos1 = float(ds1.ImagePositionPatient[2])
pos2 = float(ds2.ImagePositionPatient[2])
# Update distance_sum to calculate distance later
# TODO: use ImageOrientationPatient's normal to calculate this distance
distance_sum += abs(pos1 - pos2)
# Test measures
dimensions2 = find_shape(ds2)
spacing2 = ds2.PixelSpacing
if dimensions != dimensions2:
# We cannot produce a volume if the dimensions match
raise ValueError('Dimensions of slices does not match.')
if spacing != spacing2:
# We can still produce a volume, but we should notify the user
msg = 'Warning: spacing does not match.'
if self._showProgress is _progressCallback:
_progressBar.PrintMessage(msg)
else:
print(msg)
# Store previous
ds1 = ds2
# Create new dataset by making a deep copy of the first
info = pydicom.dataset.Dataset()
firstDs = self._datasets[0]
for key in firstDs.keys():
if key != 'PixelData':
el = firstDs[key]
info.add_new(el.tag, el.VR, el.value)
# Finish calculating average distance
# (Note that there are len(L)-1 distances)
distance_mean = distance_sum / (len(L) - 1)
# Store information that is specific for the series
self._shape = (len(L),) + dimensions
spacing.insert(0, distance_mean)
self._spacing = spacing
# Store
self._info = info
if __name__ == '__main__':
import sys
if len(sys.argv) != 2:
print("Expected a single argument: a directory with dicom files in it")
else:
adir = sys.argv[1]
t0 = time.time()
all_series = read_files(adir, False, False)
print("Summary of each series:")
for series in all_series:
print(series.description)
arr = series.get_pixel_array()