diff --git a/Dockerfile b/Dockerfile
index 4ca5afb..1fa3932 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -134,6 +134,6 @@ WORKDIR $mc_path/multic/cli
 RUN python -m slicer_cli_web.cli_list_entrypoint --list_cli
 RUN python -m slicer_cli_web.cli_list_entrypoint MultiCompartmentSegment --help
 RUN python -m slicer_cli_web.cli_list_entrypoint FeatureExtraction --help
-
+RUN python -m slicer_cli_web.cli_list_entrypoint ReferenceFeatureExtraction --help
 ENTRYPOINT ["/bin/bash", "docker-entrypoint.sh"]
diff --git a/multic/cli/ReferenceFeatureExtraction/ReferenceFeatureExtraction.py b/multic/cli/ReferenceFeatureExtraction/ReferenceFeatureExtraction.py
new file mode 100644
index 0000000..0fb06f8
--- /dev/null
+++ b/multic/cli/ReferenceFeatureExtraction/ReferenceFeatureExtraction.py
@@ -0,0 +1,57 @@
+import os
+import sys
+from glob import glob
+import girder_client
+from ctk_cli import CLIArgumentParser
+
+sys.path.append("..")
+from segmentationschool.utils.json_to_xml import get_xml_path
+
+NAMES = ['cortical_interstitium','medullary_interstitium','non_globally_sclerotic_glomeruli','globally_sclerotic_glomeruli','tubules','arteries/arterioles']
+
+
+def main(args):
+
+    folder = args.base_dir
+    wsi = args.input_file
+    file_name = wsi.split('/')[-1]
+    base_dir_id = folder.split('/')[-2]
+    _ = os.system("printf '\nUsing data from girder_client Folder: {}\n'".format(folder))
+
+    gc = girder_client.GirderClient(apiUrl=args.girderApiUrl)
+    gc.setToken(args.girderToken)
+    # get files in folder
+    files = list(gc.listItem(base_dir_id))
+    # dict to link filename to gc id
+    item_dict = dict()
+    for file in files:
+        d = {file['name']:file['_id']}
+        item_dict.update(d)
+
+    file_id = item_dict[file_name]
+
+    cwd = os.getcwd()
+    print(cwd)
+    os.chdir(cwd)
+    tmp = folder
+    _ = os.system("printf '\n---\n\nFOUND: [{}]\n'".format(file_name))
+    # get annotations, most recently updated first
+    annotations = gc.get('/annotation/item/{}'.format(file_id), parameters={'sort': 'updated'})
+    annotations.reverse()
+    annotations = list(annotations)
+
+    annotations_filtered = [annot for annot in annotations if annot['annotation']['name'].strip() in NAMES]
+    _ = os.system("printf '\tfound [{}] annotation layers...\n'".format(len(annotations_filtered)))
+    del annotations
+    # create root for xml file
+
+    xml_path = get_xml_path(annotations_filtered, NAMES, tmp, file_name)
+    _ = os.system("printf '\ndone retrieving data...\n\n'")
+
+    cmd = "python3 ../segmentationschool/segmentation_school.py --option {} --base_dir {} --file {} --xml_path {} --platform {} --item_id {} --girderApiUrl {} --girderToken {}".format('get_features', args.base_dir, args.input_file, xml_path, 'DSA', file_id, args.girderApiUrl, args.girderToken)
+    print(cmd)
+    sys.stdout.flush()
+    os.system(cmd)
+
+if __name__ == "__main__":
+    main(CLIArgumentParser().parse_args())
diff --git a/multic/cli/ReferenceFeatureExtraction/ReferenceFeatureExtraction.xml b/multic/cli/ReferenceFeatureExtraction/ReferenceFeatureExtraction.xml
new file mode 100644
index 0000000..19bc3ad
--- /dev/null
+++ b/multic/cli/ReferenceFeatureExtraction/ReferenceFeatureExtraction.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<executable>
+  <category>HistomicsTK</category>
+  <title>Extract Extended Clinical Features</title>
+  <description>Extract reference features</description>
+  <version>0.1.0</version>
+  <documentation-url>https://github.com/SarderLab/Multi-Compartment-Segmentation</documentation-url>
+  <license>Apache 2.0</license>
+  <contributor>Sayat Mimar (UFL)</contributor>
+  <acknowledgements>This work is part of efforts in digital pathology by the Sarder Lab: UFL.</acknowledgements>
+  <parameters>
+    <label>IO</label>
+    <description>Input/output parameters</description>
+    <image>
+      <name>input_file</name>
+      <label>Input Image</label>
+      <description>input file</description>
+      <channel>input</channel>
+      <index>0</index>
+    </image>
+    <directory>
+      <name>base_dir</name>
+      <label>Base Directory</label>
+      <description>Base Directory for the model</description>
+      <channel>input</channel>
+      <index>1</index>
+    </directory>
+  </parameters>
+  <parameters advanced="true">
+    <label>Girder API URL and Key</label>
+    <description>A Girder API URL and token for Girder client</description>
+    <string>
+      <name>girderApiUrl</name>
+      <longflag>api-url</longflag>
+      <label>Girder API URL</label>
+      <description>A Girder API URL (e.g., https://girder.example.com:443/api/v1)</description>
+      <default></default>
+    </string>
+    <string>
+      <name>girderToken</name>
+      <longflag>token</longflag>
+      <label>Girder Token</label>
+      <description>A Girder token</description>
+      <default></default>
+    </string>
+  </parameters>
+</executable>
\ No newline at end of file
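Reviewer note: the Girder plumbing in the new CLI can be smoke-tested outside the container. A minimal sketch, assuming a reachable DSA/Girder instance — the API URL, token, folder ID, and slide name below are placeholders, not values from this PR:

import girder_client

API_URL = 'https://girder.example.com/api/v1'   # placeholder
FOLDER_ID = '5f8a0123456789abcdef0123'          # placeholder

gc = girder_client.GirderClient(apiUrl=API_URL)
gc.setToken('<token>')                          # placeholder

# Mirror the CLI's filename -> item-id lookup.
item_dict = {item['name']: item['_id'] for item in gc.listItem(FOLDER_ID)}
print(sorted(item_dict))

# Annotation documents for one slide, same endpoint the CLI queries.
annots = gc.get('/annotation/item/{}'.format(item_dict['slide.svs']),
                parameters={'sort': 'updated'})
print([a['annotation']['name'] for a in annots])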
diff --git a/multic/cli/slicer_cli_list.json b/multic/cli/slicer_cli_list.json
index 1189b00..24a3824 100644
--- a/multic/cli/slicer_cli_list.json
+++ b/multic/cli/slicer_cli_list.json
@@ -4,5 +4,8 @@
   },
   "FeatureExtraction": {
     "type" : "python"
+  },
+  "ReferenceFeatureExtraction": {
+    "type" : "python"
   }
 }
diff --git a/multic/segmentationschool/Codes/extract_reference_features.py b/multic/segmentationschool/Codes/extract_reference_features.py
index fb9cdb5..3f048d1 100644
--- a/multic/segmentationschool/Codes/extract_reference_features.py
+++ b/multic/segmentationschool/Codes/extract_reference_features.py
@@ -1,62 +1,66 @@
-import os, sys, cv2, time
+import os, cv2
 import numpy as np
-import matplotlib.pyplot as plt
+
 import lxml.etree as ET
+
 from matplotlib import path
-# import matplotlib.patches as patches
-import glob
-from .xml_to_mask_minmax import write_minmax_to_xml,get_annotated_ROIs,xml_to_mask
+from skimage.color import rgb2lab,rgb2hsv
+
+from .xml_to_mask_minmax import write_minmax_to_xml
 import xlsxwriter
 import multiprocessing
 from scipy.ndimage.morphology import distance_transform_edt
 from scipy.ndimage import binary_fill_holes
-# from skimage.transform import resize
-# from skimage.util import img_as_ubyte
+
 from tqdm import tqdm
 import time
-import openslide
+from tiffslide import TiffSlide
 from joblib import Parallel, delayed
-from skimage.color import rgb2lab,rgb2hsv
-# from skimage.io import imsave
-# from skimage.morphology import remove_small_objects,binary_erosion,binary_dilation,disk,binary_opening
-#from skimage.filters.rank import entropy
-# from scipy.ndimage.filters import generic_filter
+from skimage.color import rgb2hsv
+
 from skimage.filters import *
-#Glom density, Tubule density,
-#Interstitial capillary density, percentage glomerulosclerosis,
-# # luminal diameter over vessel diameter
-# from wsi_loader_utils import get_choppable_regions
-# from PIL import Image
 from matplotlib import cm
-# from matplotlib.colors import ListedColormap, LinearSegmentedColormap
-# from scipy.stats import zscore
-import pandas as pd
-import seaborn as sns
-# from PAS_deconvolution import deconvolution
-# import warnings
 
 def getKidneyReferenceFeatures(args):
-    assert args.target is not None, 'Directory of xmls must be specified, use --target /path/to/files.xml'
-    assert args.wsis is not None, 'Directory of WSIs must be specified, use --wsis /path/to/wsis'
+    folder = args.base_dir
+
+    # assert args.target is not None, 'Directory of xmls must be specified, use --target /path/to/files.xml'
+    # assert args.wsis is not None, 'Directory of WSIs must be specified, use --wsis /path/to/wsis'
+    if args.platform == 'DSA':
+        import girder_client
+        gc = girder_client.GirderClient(apiUrl=args.girderApiUrl)
+        gc.setToken(args.girderToken)
+
+        file_name = args.file.split('/')[-1]
+        slide_item_id = args.item_id
+        output_dir = args.base_dir
+        slide_name,slideExt=file_name.split('.')
+        items=[(args.file, args.xml_path)]
+
+    elif args.platform == 'HPG':
+        image_files = [image_name for _, _, files in os.walk(folder) for image_name in files if image_name.endswith('.xml')]
+        image_names = [os.path.join(folder, f.split('.')[0]) for f in image_files]
+        slideExt = args.ext
+        output_dir = args.output_dir
+        # each item in items is a tuple of (images, annotations)
+        items = [(f + slideExt, f + '.xml') for f in image_names]
+
+    else:
+        raise Exception("Please enter a valid platform: DSA or HPG")
+
+    for i in range(len(items)):
+
+        svsfile, xmlfile = items[i]
 
-    annotatedXMLs=glob.glob(os.path.join(args.target, "*.xml"))
-    for xml in tqdm(annotatedXMLs):
-        # print(xml,end='\r',flush=True)
+        print(xmlfile,'here')
+        write_minmax_to_xml(xmlfile)
 
-        print(xml)
-        write_minmax_to_xml(xml)
-        for ext in args.wsi_ext.split(','):
-            if os.path.isfile(os.path.join(args.wsis,xml.split('/')[-1].replace('.xml',ext))):
-                wsi=os.path.join(args.wsis,xml.split('/')[-1].replace('.xml',ext))
-                break
-        slideID,slideExt=os.path.splitext(wsi.split('/')[-1])
         all_contours = {'1':[],'2':[],'3':[],'4':[],'5':[],'6':[]} # cortex medulla glomeruli scl_glomeruli tubules arteries(ioles)
-        tree = ET.parse(xml)
+        tree = ET.parse(xmlfile)
         root = tree.getroot()
-        basename=os.path.splitext(xml)[0]
+        basename=os.path.splitext(xmlfile)[0]
         for Annotation in root.findall("./Annotation"): # for all annotations
             annotationID = Annotation.attrib['Id']
             if annotationID not in ['1','2','3','4','5','6']:
@@ -75,14 +79,14 @@ def getKidneyReferenceFeatures(args):
 
         cortexarea=0
         medullaarea=0
-        slide=openslide.OpenSlide(wsi)
+        slide=TiffSlide(svsfile)
         if slideExt =='.scn':
-            dim_x=int(slide.properties['openslide.bounds-width'])## add to columns
-            dim_y=int(slide.properties['openslide.bounds-height'])## add to rows
-            offsetx=int(slide.properties['openslide.bounds-x'])##start column
-            offsety=int(slide.properties['openslide.bounds-y'])##start row
+            dim_x=int(slide.properties['tiffslide.bounds-width'])## add to columns
+            dim_y=int(slide.properties['tiffslide.bounds-height'])## add to rows
+            offsetx=int(slide.properties['tiffslide.bounds-x'])##start column
+            offsety=int(slide.properties['tiffslide.bounds-y'])##start row
 
-        elif slideExt in ['.ndpi','.svs']:
+        elif slideExt in ['.ndpi','.svs','.tiff']:
             dim_x, dim_y=slide.dimensions
             offsetx=0
             offsety=0
@@ -112,15 +116,7 @@ def getKidneyReferenceFeatures(args):
         binary=binary_fill_holes(binary)
         total_tissue_area=np.sum(binary)*resRatio*resRatio
 
-        # imsave(wsi.replace(slideExt,'.jpeg'),thumbIm)
-        # with warnings.catch_warnings():
-        #     warnings.simplefilter("ignore")
-        #     imsave(wsi.replace(slideExt,'.jpeg'),thumbIm)
-        # imsave(wsi.replace(slideExt,'_b.jpeg'),binary.astype('uint8')*255)
-        #
-        # continue
-
-
+
         for contour in all_contours['1']:
             cortexcontour.extend(contour)
             cortexarea+=cv2.contourArea(contour)
@@ -132,17 +128,6 @@ def getKidneyReferenceFeatures(args):
             cortex_path=path.Path(cortexcontour,codes=cortexcodes)
         else:
             cortex_path=None
-            # cortexcontour=np.array(cortexcontour)
-            # pathxmin=np.min(cortexcontour[:,0])
-            # pathxmax=np.max(cortexcontour[:,0])
-            # pathymin=np.min(cortexcontour[:,1])
-            # pathymax=np.max(cortexcontour[:,1])
-            # fig, ax = plt.subplots()
-            # patch = patches.PathPatch(cortex_path, facecolor='orange', lw=2)
-            # ax.add_patch(patch)
-            # ax.set_xlim(pathxmin, pathxmax)
-            # ax.set_ylim(pathymin, pathymax)
-            # plt.show()
 
         for contour in all_contours['2']:
             medullacontour.extend(contour)
@@ -157,7 +142,8 @@ def getKidneyReferenceFeatures(args):
             medulla_path=None
         pseudocortexarea=total_tissue_area-medullaarea
         #
-        workbook=xlsxwriter.Workbook(xml.replace('.xml','.xlsx'))
+        xlsx_path = os.path.join(output_dir, os.path.basename(svsfile).split('.')[0] +'_extended_clinical'+'.xlsx')
+        workbook=xlsxwriter.Workbook(xlsx_path)
         worksheet1 = workbook.add_worksheet('Summary')
         worksheet2 = workbook.add_worksheet('Interstitium')
         worksheet3 = workbook.add_worksheet('Glomeruli')
         worksheet4 = workbook.add_worksheet('Sclerotic glomeruli')
         worksheet5 = workbook.add_worksheet('Tubules')
         worksheet6 = workbook.add_worksheet('Arteries - Arterioles')
@@ -188,30 +174,30 @@ def getKidneyReferenceFeatures(args):
             args,args.min_size[4],cortex_path,medulla_path) for points in tqdm(all_contours['5'],colour='blue',unit='Tubule',leave=False))
         art_features=Parallel(n_jobs=cores)(delayed(points_to_features_art)(points,
-            args,args.min_size[5],cortex_path,medulla_path,wsi,MOD) for points in tqdm(all_contours['6'],colour='magenta',unit='Artery(-iole)',leave=False))
+            args,args.min_size[5],cortex_path,medulla_path,svsfile,MOD) for points in tqdm(all_contours['6'],colour='magenta',unit='Artery(-iole)',leave=False))
         print('Generating output file..')
-        glom_features=[i for i in glom_features if i is not None]
-        sglom_features=[i for i in sglom_features if i is not None]
-        tub_features=[i for i in tub_features if i is not None]
-        art_features=[i for i in art_features if i is not None]
+        glom_features=np.array([i for i in glom_features if i is not None])
+        sglom_features=np.array([i for i in sglom_features if i is not None])
+        tub_features=np.array([i for i in tub_features if i is not None])
+        art_features=np.array([i for i in art_features if i is not None])
         # gloms_features=[i for i in glom_features if i[0]>args.min_sizes[2]]
         # sglom_features=[i for i in sglom_features if i[0]>args.min_sizes[3]]
         # cortexgloms=[i for i in glom_features if not i[3]]
-        cortextubs=[i for i in tub_features if not i[3]]
-        cortexarts=[i for i in art_features if not i[3]]
+        cortextubs=np.array([i for i in tub_features if not i[3]])
+        cortexarts=np.array([i for i in art_features if not i[3]])
 
-        medullatubs=[i for i in tub_features if i[3]]
-        medullaarts=[i for i in art_features if i[3]]
+        medullatubs=np.array([i for i in tub_features if i[3]])
+        medullaarts=np.array([i for i in art_features if i[3]])
 
         if pseudocortexarea>0:
             cortex_glom_area=np.sum(np.array(glom_features)[:,0])
             cortex_glom_density=float(cortex_glom_area)/float(pseudocortexarea)
-            cortex_tub_area=np.sum(np.array(cortextubs)[:,0])
+            cortex_tub_area=np.sum(cortextubs[:,0])
             cortex_tub_density=float(cortex_tub_area)/float(pseudocortexarea)
-            cortex_art_area=np.sum(np.array(cortexarts)[:,0])
+            cortex_art_area=np.sum(cortexarts[:,0])
             cortex_art_density=float(cortex_art_area)/float(pseudocortexarea)
             # downsample_cortex=get_downsample_cortex(args,all_contours['1'])
             # exit()
@@ -223,9 +209,9 @@ def getKidneyReferenceFeatures(args):
             cortex_art_density=None
 
         if medullaarea>0 and len(medullatubs)>0:
-            medulla_tub_area=np.sum(np.array(medullatubs)[:,0])
+            medulla_tub_area=np.sum(medullatubs[:,0])
             if len(medullaarts)>0:
-                medulla_art_area=np.sum(np.array(medullaarts)[:,0])
+                medulla_art_area=np.sum(medullaarts[:,0])
                 medulla_art_density=float(medulla_art_area)/float(medullaarea)
             else:
                 medulla_art_density=None
@@ -233,11 +219,7 @@ def getKidneyReferenceFeatures(args):
         else:
             medulla_tub_density=None
             medulla_art_density=None
-        glom_features=np.array(glom_features)
-        sglom_features=np.array(sglom_features)
-        tub_features=np.array(tub_features)
-        art_features=np.array(art_features)
-        cortexarts=np.array(cortexarts)
+
         worksheet1.write(0,0,'Glomerular density - count:')
         worksheet1.write(0,1,len(glom_features)/pseudocortexarea)
         worksheet1.write(1,0,'Average glomerular area:')
@@ -280,11 +262,16 @@ def getKidneyReferenceFeatures(args):
         worksheet1.write(13,1,np.mean(cortextubs[:,1]))
         worksheet1.write(14,1,np.std(cortextubs[:,1]))
         if medullaarea>0:
-            medullatubs=np.array(medullatubs)
-            worksheet1.write(15,1,np.mean(medullatubs[:,0]))
-            worksheet1.write(16,1,np.std(medullatubs[:,0]))
-            worksheet1.write(17,1,np.mean(medullatubs[:,1]))
-            worksheet1.write(18,1,np.std(medullatubs[:,1]))
+            if len(medullatubs)>0:
+                worksheet1.write(15,1,np.mean(medullatubs[:,0]))
+                worksheet1.write(16,1,np.std(medullatubs[:,0]))
+                worksheet1.write(17,1,np.mean(medullatubs[:,1]))
+                worksheet1.write(18,1,np.std(medullatubs[:,1]))
+            else:
+                worksheet1.write(15,1,0)
+                worksheet1.write(16,1,0)
+                worksheet1.write(17,1,0)
+                worksheet1.write(18,1,0)
 
         worksheet1.write(19,0,'Cortical arterial(olar) density')
         worksheet1.write(19,1,len(cortexarts)/pseudocortexarea)
@@ -331,438 +318,84 @@ def getKidneyReferenceFeatures(args):
         worksheet1.write(33,1,len(glom_features))
         worksheet1.write(34,0,'sGlomerulus count')
         worksheet1.write(34,1,len(sglom_features))
-        worksheet3.write(0,0,'Area')
-        worksheet3.write(0,1,'Radius')
+
+        worksheet3.write(0,0,'Area (pixel^2)')
+        worksheet3.write(0,1,'Radius (pixel)')
+        worksheet3.write(0,2,'x1')
+        worksheet3.write(0,3,'x2')
+        worksheet3.write(0,4,'y1')
+        worksheet3.write(0,5,'y2')
+
         for idx,glom in enumerate(glom_features):
             worksheet3.write(idx+1,0,glom[0])
             worksheet3.write(idx+1,1,glom[1])
-        worksheet4.write(0,0,'Area')
-        worksheet4.write(0,1,'Radius')
+            worksheet3.write(idx+1,2,glom[4])
+            worksheet3.write(idx+1,3,glom[5])
+            worksheet3.write(idx+1,4,glom[6])
+            worksheet3.write(idx+1,5,glom[7])
+
+
+        worksheet4.write(0,0,'Area (pixel^2)')
+        worksheet4.write(0,1,'Radius (pixel)')
+
+        worksheet4.write(0,2,'x1')
+        worksheet4.write(0,3,'x2')
+        worksheet4.write(0,4,'y1')
+        worksheet4.write(0,5,'y2')
+
         for idx,sglom in enumerate(sglom_features):
             worksheet4.write(idx+1,0,sglom[0])
             worksheet4.write(idx+1,1,sglom[1])
-        worksheet5.write(0,0,'Area')
-        worksheet5.write(0,1,'Radius')
+            worksheet4.write(idx+1,2,sglom[4])
+            worksheet4.write(idx+1,3,sglom[5])
+            worksheet4.write(idx+1,4,sglom[6])
+            worksheet4.write(idx+1,5,sglom[7])
+
+        worksheet5.write(0,0,'Area (pixel^2)')
+        worksheet5.write(0,1,'Radius (pixel)')
+        worksheet5.write(0,2,'In Medulla')
+
+        worksheet5.write(0,3,'x1')
+        worksheet5.write(0,4,'x2')
+        worksheet5.write(0,5,'y1')
+        worksheet5.write(0,6,'y2')
         for idx,tub in enumerate(tub_features):
             worksheet5.write(idx+1,0,tub[0])
             worksheet5.write(idx+1,1,tub[1])
+            worksheet5.write(idx+1,2,tub[3])
 
-        worksheet5.write(0,2,'Cortical areas')
-        worksheet5.write(0,3,'Cortical radii')
-        for idx,tub in enumerate(cortextubs):
-            worksheet5.write(idx+1,2,tub[0])
-            worksheet5.write(idx+1,3,tub[1])
-
-        worksheet5.write(0,4,'Medullary areas')
-        worksheet5.write(0,5,'Medullary radii')
-        for idx,tub in enumerate(medullatubs):
-            worksheet5.write(idx+1,4,tub[0])
-            worksheet5.write(idx+1,5,tub[1])
+            worksheet5.write(idx+1,3,tub[4])
+            worksheet5.write(idx+1,4,tub[5])
+            worksheet5.write(idx+1,5,tub[6])
+            worksheet5.write(idx+1,6,tub[7])
 
-        worksheet6.write(0,0,'Area')
-        worksheet6.write(0,1,'Radius')
+        worksheet6.write(0,0,'Area (pixel^2)')
+        worksheet6.write(0,1,'Radius (pixel)')
         worksheet6.write(0,2,'Luminal ratio')
+
+        worksheet6.write(0,3,'x1')
+        worksheet6.write(0,4,'x2')
+        worksheet6.write(0,5,'y1')
+        worksheet6.write(0,6,'y2')
         for idx,art in enumerate(art_features):
             worksheet6.write(idx+1,0,art[0])
             worksheet6.write(idx+1,1,art[1])
             worksheet6.write(idx+1,2,art[4])
-        workbook.close()
-        print('Done.')
-        # exit()
-    # Parallel(n_jobs=num_cores, backend='threading')(delayed(chop_wsi)(, choppable_regions=choppable_regions) for idxx, j in enumerate(index_x))
-
-
-def summarizeKidneyReferenceFeatures(args):
-    assert args.target is not None, 'Directory of xmls must be specified, use --target /path/to/files.xml'
-    assert args.SummaryOption is not None, 'You must specify what type of summary is required with --SummaryOption'
-    assert args.patientData is not None, 'You must provide patient metadata xlsx file with --patientData'
-    if args.SummaryOption in ['ULDensity']:
-        ULDensity(args)
-    elif args.SummaryOption in ['BLDensity']:
-        BLDensity(args)
-    elif args.SummaryOption in ['standardScatter']:
-        standardScatter(args)
-    elif args.SummaryOption in ['anchorScatter']:
-        anchorScatter(args)
-    elif args.SummaryOption in ['aggregate']:
-        aggregate(args)
-    elif args.SummaryOption in ['JoyPlot']:
-        JoyPlot(args)
-    else:
-        print('Incorrect SummaryOption')
-
-def anchorScatter(args):
-    patientData=pd.read_excel(args.patientData,usecols=[args.labelColumns,args.IDColumn,args.anchor],index_col=None)
-    patientMetrics={}
-    for idx,patientID in enumerate(patientData[args.IDColumn]):
-        patientMetrics[patientID]=[patientData[args.labelColumns][idx],patientData[args.anchor][idx]]
-
-    datafiles=glob.glob(os.path.join(args.target, "*.xlsx"))
-    clinical_legend=[]
-    clinical_anchor=[]
-    for datafile in datafiles:
-        patientID=os.path.splitext(datafile.split('/')[-1])[0]
-        clinical_legend.append(str(patientMetrics[patientID][0]))
-        clinical_anchor.append(patientMetrics[patientID][1])
-
-    sortedPatientOrder=[x for _, x in sorted(zip(clinical_legend,datafiles))]
-    sortedPatientLegend=[x for x,_ in sorted(zip(clinical_legend,datafiles))]
-    sortedPatientAnchor=[x for x,_ in sorted(zip(clinical_anchor,datafiles))]
-    f1,f2=[int(i) for i in args.scatterFeatures.split(',')]
-    # f1=5
-    # f2=7
-
-    temp=pd.read_excel(datafiles[0],sheet_name='Summary',header=None,index_row=None,index_col=0)
-    index = temp.index
-    xlabel=index[f1-1]
-    ylabel=args.anchor
-
-    popIdx=[]
-    scatter_features=[]
-    for idx,datafile in enumerate(tqdm(sortedPatientOrder,colour='red')):
-        features=np.array(pd.read_excel(datafile,sheet_name='Summary',header=None,index_row=None,index_col=0))
-        # print(np.array(features))
-        if not np.isnan(features[f1-1]) and not np.isnan(features[f2-1]):
-            scatter_features.append([features[f1-1][0],features[f2-1][0]])
-        else:
-            popIdx.append(idx)
-    for p in popIdx[::-1]:
-        sortedPatientLegend.pop(p)
-        sortedPatientAnchor.pop(p)
-    scatter_features=np.array(scatter_features)
-
-    sns.scatterplot(x=scatter_features[:,0],y=sortedPatientAnchor,hue=sortedPatientLegend,palette='viridis',legend='auto')
-    plt.xlabel(xlabel)
-    plt.ylabel(ylabel)
-    plt.legend(title=args.labelColumns)
-    plt.show()
-def standardScatter(args):
-    patientData=pd.read_excel(args.patientData,usecols=[args.labelColumns,args.IDColumn],index_col=None,converters={args.IDColumn:str})
-    patientMetrics={}
-    assert args.labelColumns in ['Age','Cr','Sex'], 'Label column must be Age, Cr, or Sex for standard scatter'
-    if args.labelColumns=='Age':
-        labelBins=[0,10,20,30,40,50,60,70,80,90,100]
-    elif args.labelColumns=='Cr':
-        labelBins=[0,.5,.6,.7,.8,.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2]
-    elif args.labelColumns=='Sex':
-        labelBins=None
-
-    for idx,patientID in enumerate(patientData[args.IDColumn]):
-        patientMetrics[patientID]=patientData[args.labelColumns][idx]
-
-    datafiles=glob.glob(os.path.join(args.target, "*.xlsx"))
-    clinical_legend=[]
-    for datafile in datafiles:
-        patientID=os.path.splitext(datafile.split('/')[-1])[0]
-        clinical_legend.append(str(patientMetrics[patientID]))
-
-    sortedPatientOrder=[x for _, x in sorted(zip(clinical_legend,datafiles))]
-    sortedPatientLegend=[x for x,_ in sorted(zip(clinical_legend,datafiles))]
-    print(sortedPatientLegend)
-    f1,f2=[int(i) for i in args.scatterFeatures.split(',')]
-    # f1=5
-    # f2=7
-
-    temp=pd.read_excel(datafiles[0],sheet_name='Summary',header=None,index_row=None,index_col=0)
-    index = temp.index
-    xlabel=index[f1-1]
-    ylabel=index[f2-1]
-
-    popIdx=[]
-    scatter_features=[]
-    for idx,datafile in enumerate(tqdm(sortedPatientOrder,colour='red')):
-        features=np.array(pd.read_excel(datafile,sheet_name='Summary',header=None,index_row=None,index_col=0))
-        # print(np.array(features))
-        if not np.isnan(features[f1-1]) and not np.isnan(features[f2-1]):
-            scatter_features.append([features[f1-1][0],features[f2-1][0]])
-        else:
-            popIdx.append(idx)
-    for p in popIdx[::-1]:
-        sortedPatientLegend.pop(p)
-    sortedPatientLegend=np.array(sortedPatientLegend)
-
-    scatter_features=np.array(scatter_features)
-
-    sns.scatterplot(x=scatter_features[:,0],y=scatter_features[:,1],hue=sortedPatientLegend,palette='viridis')
-    plt.xlabel(xlabel)
-    plt.ylabel(ylabel)
-    plt.legend(sortedPatientLegend,title=args.labelColumns)
-    plt.show()
-
-def BLDensity(args):
-    patientData=pd.read_excel(args.patientData,usecols=[args.labelColumns,args.IDColumn],index_col=None)
-    patientMetrics={}
-    for idx,patientID in enumerate(patientData[args.IDColumn]):
-        patientMetrics[patientID]=patientData[args.labelColumns][idx]
-
-    datafiles=glob.glob(os.path.join(args.target, "*.xlsx"))
-    clinical_legend=[]
-    for datafile in datafiles:
-        patientID=os.path.splitext(datafile.split('/')[-1])[0]
-        clinical_legend.append(str(patientMetrics[patientID]))
-
-    sortedPatientOrder=[x for _, x in sorted(zip(clinical_legend,datafiles))]
-    sortedPatientLegend=[x for x,_ in sorted(zip(clinical_legend,datafiles))]
-
-    fig,ax=plt.subplots()
-    plt.xlabel('Cortical tubular diameter (µm)')
-    plasma = cm.get_cmap('plasma',len(sortedPatientOrder))
-    popIdx=[]
-    for idx,datafile in enumerate(tqdm(sortedPatientOrder)):
-        tubule_features=pd.read_excel(datafile,sheet_name='Tubules',usecols=['Cortical radii','Cortical areas'],index_col=None).dropna()
-        if len(tubule_features['Cortical radii'])>0:
-            # xl1=np.percentile(tubule_features['Cortical radii']*0.25,0.01)
-            xl2=np.percentile(tubule_features['Cortical radii']*0.25,99)
-            # yl1=np.percentile(tubule_features['Cortical areas']*0.25*0.25,0.01)
-            yl2=np.percentile(tubule_features['Cortical areas']*0.25*0.25,99)
-            # sns.kdeplot(tubule_features['Cortical radii']*0.25,ax=ax,clip=[l1,l2],bw_adjust=0.75,color=plasma(idx),fill=args.plotFill)
-            sns.kdeplot(x=tubule_features['Cortical radii']*0.25,y=tubule_features['Cortical areas']*0.25*0.25,
-                color=plasma(idx),ax=ax,clip=[[None,xl2],[None,yl2]])
-        else:
-            popIdx.append(idx)
-    for p in popIdx[::-1]:
-        sortedPatientLegend.pop(p)
-    plt.legend(sortedPatientLegend,title=args.labelColumns)
-    plt.show()
-
-def ULDensity(args):
-
-    if args.labelModality=='Continuous':
-        patientData=pd.read_excel(args.patientData,usecols=[args.labelColumns,args.IDColumn],index_col=None,converters={args.IDColumn:str})
-        patientMetrics={}
-        # print(patientData)
-        for idx,patientID in enumerate(patientData[args.IDColumn]):
-            patientMetrics[patientID]=patientData[args.labelColumns][idx]
-
-        datafiles=glob.glob(os.path.join(args.target, "*.xlsx"))
-        clinical_legend=[]
-        for datafile in datafiles:
-            patientID=os.path.splitext(datafile.split('/')[-1])[0]
-            clinical_legend.append(str(patientMetrics[patientID]))
-
-        sortedPatientOrder=[x for _, x in sorted(zip(clinical_legend,datafiles))]
-        sortedPatientLegend=[x for x,_ in sorted(zip(clinical_legend,datafiles))]
-
-        fig,ax=plt.subplots()
-        plt.xlabel('Cortical tubular diameter (µm)')
-        plasma = cm.get_cmap('plasma',len(sortedPatientOrder))
-        popIdx=[]
-        all_tubules=[]
-        featurename='Cortical radii'
-        sheetname='Tubules'
-        for idx,datafile in enumerate(tqdm(sortedPatientOrder,colour='red')):
-            tubule_features=pd.read_excel(datafile,sheet_name=sheetname,usecols=[featurename],index_col=None).dropna()
-            if len(tubule_features[featurename])>0:
-                l1=np.percentile(tubule_features[featurename]*0.25,0.05)
-                l2=np.percentile(tubule_features[featurename]*0.25,99.5)
-                sns.kdeplot(tubule_features[featurename]*0.25,ax=ax,clip=[l1,l2],bw_adjust=0.75,color=plasma(idx),fill=args.plotFill,alpha=0.1)
-                # sns.kdeplot(tubule_features[featurename]*0.25,ax=ax,bw_adjust=0.75,color=plasma(idx),fill=args.plotFill,alpha=0.1)
-                all_tubules.extend(np.array(tubule_features[featurename]*0.25))
-            else:
-                popIdx.append(idx)
-        for p in popIdx[::-1]:
-            sortedPatientLegend.pop(p)
-
-        plt.legend(sortedPatientLegend,title=args.labelColumns)
-        plt.title('Cumulative distributions per patient')
-        # plt.colorbar()
-        plt.show()
-        # print(len(all_tubules))
-        # fig,ax=plt.subplots()
-        # plt.xlabel('Cortical tubular diameter (µm)')
-        # sns.kdeplot(all_tubules,ax=ax,bw_adjust=0.75,fill=args.plotFill)
-        # plt.title('Cumulative distribution for dataset')
-        # plt.show()
-    elif args.labelModality=='Categorical':
-        patientData=pd.read_excel(args.patientData,usecols=[args.labelColumns,args.IDColumn],index_col=None,converters={args.IDColumn:str})
-        patientMetrics={}
-        for idx,patientID in enumerate(patientData[args.IDColumn]):
-            patientMetrics[patientID]=patientData[args.labelColumns][idx]
-
-        datafiles=glob.glob(os.path.join(args.target, "*.xlsx"))
-        clinical_legend=[]
-        for datafile in datafiles:
-            patientID=os.path.splitext(datafile.split('/')[-1])[0]
-
-            clinical_legend.append(str(patientMetrics[patientID]))
-
-        fig,ax=plt.subplots()
-        plt.xlabel('Cortical tubular diameter (µm)')
-        plasma = cm.get_cmap('plasma',len(np.unique(clinical_legend))+1)
-        labelMapper={}
-        categories=np.unique(clinical_legend)
-        for idx,l in enumerate(categories):
-            labelMapper[l]=idx
-        popIdx=[]
-        all_tubules=[]
-        featurename='Radius'
-        sheetname='Glomeruli'
-        for idx,datafile in enumerate(tqdm(datafiles,colour='red')):
-            tubule_features=pd.read_excel(datafile,sheet_name=sheetname,usecols=[featurename],index_col=None).dropna()
-            if len(tubule_features[featurename])>0:
-                l1=np.percentile(tubule_features[featurename]*0.25,0.01)
-                l2=np.percentile(tubule_features[featurename]*0.25,99.9)
-
-                pcolor=plasma(labelMapper[clinical_legend[idx]])
-                # sns.kdeplot(tubule_features[featurename]*0.25,ax=ax,clip=[l1,l2],bw_adjust=0.75,color=pcolor,fill=args.plotFill,alpha=0.5)
-                sns.kdeplot(tubule_features[featurename]*0.25,ax=ax,bw_adjust=0.75,color=pcolor,fill=args.plotFill,alpha=0.5)
-
-                all_tubules.extend(np.array(tubule_features[featurename]*0.25))
-            else:
-                popIdx.append(idx)
-        for p in popIdx[::-1]:
-            clinical_legend.pop(p)
-        plt.legend(clinical_legend,title=args.labelColumns)
-        plt.title('Cumulative distributions per patient')
-        plt.show()
-        # fig,ax=plt.subplots()
-        # plt.xlabel('Cortical tubular diameter (µm)')
-        # sns.kdeplot(all_tubules,ax=ax,bw_adjust=0.75,fill=args.plotFill)
-        # plt.title('Cumulative distribution for dataset')
-        # plt.show()
-
-
-def JoyPlot(args):
-
-    url = "https://gist.githubusercontent.com/borgar/31c1e476b8e92a11d7e9/raw/0fae97dab6830ecee185a63c1cee0008f6778ff6/pulsar.csv"
-    df = pd.read_csv(url, header=None)
-    df = df.stack().reset_index()
-    df.columns = ['idx', 'x', 'y']
-
-    patientData=pd.read_excel(args.patientData,usecols=[args.labelColumns,args.IDColumn],index_col=None,converters={args.IDColumn:str})
-    patientMetrics={}
-    for idx,patientID in enumerate(patientData[args.IDColumn]):
-        patientMetrics[patientID]=patientData[args.labelColumns][idx]
-    datafiles=glob.glob(os.path.join(args.target, "*.xlsx"))
-    clinical_legend=[]
-    for datafile in datafiles:
-        patientID=os.path.splitext(datafile.split('/')[-1])[0]
-        clinical_legend.append(str(patientMetrics[patientID]))
-    sortedPatientOrder=[x for _, x in sorted(zip(clinical_legend,datafiles))]
-    sortedPatientLegend=[x for x,_ in sorted(zip(clinical_legend,datafiles))]
+            worksheet6.write(idx+1,3,art[5])
+            worksheet6.write(idx+1,4,art[6])
+            worksheet6.write(idx+1,5,art[7])
+            worksheet6.write(idx+1,6,art[8])
 
-    popIdx=[]
-    all_tubules=[]
-    # outDF=pd.DataFrame()
-
-    for idx,datafile in enumerate(tqdm(sortedPatientOrder[:15],colour='red')):
-
-        tubule_features=pd.read_excel(datafile,sheet_name='Tubules',usecols=['Cortical radii'],index_col=None).dropna()
-        if len(tubule_features['Cortical radii'])>0:
-            #.values()
-            feature_array=np.array(tubule_features['Cortical radii']*0.25)
-            all_tubules.append(feature_array)
+        workbook.close()
+        if args.platform == 'DSA':
+            gc.uploadFileToItem(slide_item_id, xlsx_path, reference=None, mimeType=None, filename=None, progressCallback=None)
+            print('Girder file uploaded!')
 
-            l1=np.percentile(feature_array,0.05)
-            l2=np.percentile(feature_array,99.5)
-            # sns.kdeplot(tubule_features['Cortical radii']*0.25,ax=ax,clip=[l1,l2],bw_adjust=0.75,color=plasma(idx),fill=args.plotFill,cbar=True)
-            # all_tubules.extend(np.array(tubule_features['Cortical radii']*0.25))
+        print('Done.')
 
-        else:
-            popIdx.append(idx)
-    for p in popIdx[::-1]:
-        sortedPatientLegend.pop(p)
-    outDF=pd.DataFrame(all_tubules)
-
-    outDF=outDF.stack(dropna=False).reset_index().dropna()
-
-    # outDFT=outDF.transpose()
-    # input(outDF)
-    # input(outDF.reset_index())
-    # input(outDF.reset_index(drop=True)[::2].reset_index(drop=True))
-    # input(outDF.reset_index(drop=True))
-    # input(outDF.stack())
-    # input(outDF.reset_index().stack())
-    # input(outDF.reset_index(drop=True).stack())
-    # outDF=outDF.reset_index(drop=True)[::2].reset_index(drop=True)
-    # outDF = outDF.stack().reset_index()
-    outDF.columns = ['idx', 'x', 'y']
-    outDF.to_csv('test.csv')
-    # print(outDF.dropna())
-    # exit()
-
-
-
-
-    sns.set_theme(rc={"axes.facecolor": (0, 0, 0,0), 'figure.facecolor':'#ffffff', 'axes.grid':False})
-    g = sns.FacetGrid(outDF, row='idx',hue="idx", aspect=15)
-    g.map(sns.kdeplot, "x",
-        bw_adjust=.9, clip_on=False,
-        fill=True, alpha=1, linewidth=1.5)
-    g.map(sns.kdeplot, "x", clip_on=False, color="w", lw=2, bw_adjust=.9)
-    g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)
-    def label(x, color, label):
-        ax = plt.gca()
-        ax.text(0, .2, label, fontweight="bold", color=color,
-            ha="left", va="center", transform=ax.transAxes)
-    g.map(label, "x")
-
-    # Draw the densities in a few steps
-    # g.map(sns.lineplot, 'x', 'y', clip_on=False, alpha=1, linewidth=1.5)
-    # g.map(plt.fill_between, 'x', 'y', color='#000000')
-    # g.map(sns.lineplot, 'x', 'y', clip_on=False, color='#ffffff', lw=2)
-    # Set the subplots to overlap
-    g.fig.subplots_adjust(hspace=-0.95)
-    g.set_titles("")
-    g.set(yticks=[], xticks=[], ylabel="", xlabel="")
-    g.despine(bottom=True, left=True)
-    plt.show()
-    # plt.savefig('joy.png', facecolor='#000000')
-
-def aggregate(args):
-    assert args.exceloutfile is not None, 'You must provide a name of xlsx output file for feature aggregation with --exceloutfile name.xlsx'
-    usecols=args.labelColumns.split(',')
-    # index_col=int(args.IDColumn)
-    patientData=pd.read_excel(args.patientData,usecols=usecols.extend(args.IDColumn),index_col=args.IDColumn)
-    patientData = patientData.loc[:, ~patientData.columns.str.contains('^Unnamed')]
-    patientMetrics={}
-
-    pd.set_option('display.max_rows', 100)
-    patientData.index = patientData.index.map(str)
-
-    datafiles=glob.glob(os.path.join(args.target, "*.xlsx"))
-    # numGloms=0
-    # numSGloms=0
-    # numTubules=0
-    # numArts=0
-    full_data={}
-    for idx,datafile in enumerate(tqdm(datafiles,colour='red')):
-
-        xlsxid=datafile.split('/')[-1].split('.xlsx')[0]
-        tubule_features=pd.read_excel(datafile,sheet_name='Summary',header=None,index_col=None).transpose()
-        names=np.array(tubule_features.iloc[0])
-        vals=np.array(tubule_features.iloc[1])
-        # numGloms+=vals[0]
-        # if not np.isnan(vals[5]):
-        #     numSGloms+=vals[5]
-        # numTubules+=vals[10]
-        # numArts+=vals[19]
-
-        full_data[xlsxid]=np.append(vals,np.array(patientData.loc[xlsxid]),0)
-    # print(numGloms)
-    # print(numSGloms)
-    # print(numTubules)
-    # print(numArts)
-    # exit()
-    workbook=xlsxwriter.Workbook(args.exceloutfile,{'nan_inf_to_errors': True})
-    worksheet1 = workbook.add_worksheet('Aggregation')
-
-    outcolnames=np.append(names,patientData.columns,0)
-    outrownames=full_data.keys()
-    for idx,outcol in enumerate(outcolnames):
-        worksheet1.write(0,idx+1,outcol)
-
-    rowcounter=1
-    for key,vals in full_data.items():
-        worksheet1.write(rowcounter,0,key)
-        for idx,val in enumerate(vals):
-            worksheet1.write(rowcounter,idx+1,val)
-        rowcounter+=1
-
-    workbook.close()
 def points_to_features_glom(points,args,min_size,cortex,medulla):
     a=cv2.contourArea(points)
     if a>min_size:
@@ -782,7 +415,7 @@ def points_to_features_glom(points,args,min_size,cortex,medulla):
 
         dist=distance_transform_edt(binary_mask)
 
-        return [a,np.max(dist),None,containedmedulla]
+        return [a,np.max(dist),None,containedmedulla,yMin,yMax,xMin,xMax]
 
 def points_to_features_tub(points,args,min_size,cortex,medulla):
     a=cv2.contourArea(points)
@@ -802,7 +435,7 @@ def points_to_features_tub(points,args,min_size,cortex,medulla):
         binary_mask=cv2.fillPoly(binary_mask,[points],1)
         dist=distance_transform_edt(binary_mask)
 
-        return [a,np.max(dist),None,containedmedulla]
+        return [a,np.max(dist),None,containedmedulla,yMin,yMax,xMin,xMax]
 
 def points_to_features_art(points,args,min_size,cortex,medulla,wsi,MOD):
@@ -818,10 +451,10 @@ def points_to_features_art(points,args,min_size,cortex,medulla,wsi,MOD):
         else:
             containedmedulla=False
         xMin,xMax,yMin,yMax=[np.min(points[:,0]),np.max(points[:,0]),np.min(points[:,1]),np.max(points[:,1])]
-        image=np.array(openslide.OpenSlide(wsi).read_region((xMin,yMin),0,(xMax-xMin,yMax-yMin)))[:,:,:3]
+        image=np.array(TiffSlide(wsi).read_region((xMin,yMin),0,(xMax-xMin,yMax-yMin)))[:,:,:3]
 
         if xMax-xMin>5000 or yMax-yMin>5000:
-            return [a,None,None,containedmedulla,None]
+            return [a,None,None,containedmedulla,None, yMin,yMax,xMin,xMax]
         binary_mask=np.zeros((int(yMax-yMin),int(xMax-xMin)))
         points[:,0]-=xMin
         points[:,1]-=yMin
@@ -875,11 +508,11 @@ def points_to_features_art(points,args,min_size,cortex,medulla,wsi,MOD):
         # #     return [a,distMax*2,containedcortex,containedmedulla,(np.max(WSdist)*2)/(distMax*2)]
         # else:
-        return [a,distMax,None,containedmedulla,np.max(WSdist)/distMax]
+        return [a,distMax,None,containedmedulla,np.max(WSdist)/distMax,yMin,yMax,xMin,xMax]
 
-# full_list.append(tubule_features)
-# full_list= pd.concat(full_list)
-# print(full_list)
-# exit()
+# # full_list.append(tubule_features)
+# # full_list= pd.concat(full_list)
+# # print(full_list)
+# # exit()
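Reviewer note: the openslide-to-tiffslide change above is meant to be a drop-in swap. A quick sketch of the shared API surface the refactor relies on — the slide path is a placeholder:

from tiffslide import TiffSlide

slide = TiffSlide('slide.svs')                       # placeholder path
print(slide.dimensions)                              # (width, height), as with OpenSlide
region = slide.read_region((0, 0), 0, (512, 512))    # PIL image, RGBA
print(slide.properties.get('tiffslide.mpp-x'))       # metadata keys use the tiffslide. prefix

Note that property keys change namespace, which is why the '.scn' branch rewrites 'openslide.bounds-*' to 'tiffslide.bounds-*'.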
diff --git a/multic/segmentationschool/extraction_utils/process_mc_features.py b/multic/segmentationschool/extraction_utils/process_mc_features.py
index 2f1f759..134d618 100644
--- a/multic/segmentationschool/extraction_utils/process_mc_features.py
+++ b/multic/segmentationschool/extraction_utils/process_mc_features.py
@@ -5,31 +5,26 @@
 from skimage.morphology import remove_small_objects,binary_erosion,binary_dilation,disk,binary_opening,binary_closing
 from skimage.filters import threshold_otsu
 from skimage import measure
-
+import cv2
 from .extract_ffpe_features import imreconstruct
 from .PAS_deconvolution import deconvolution
 
-GLOM_DICT = {3:'Glomeruli',4:'Sclerotic Glomeruli'}
-
-def process_glom_features(mask_xml, glom_value, MOD, slide, mpp, h_threshold, saturation_threshold):
-    glomeruli = mask_xml == glom_value
-    glomeruli = glomeruli.astype(np.uint8)
-    glomeruli = measure.label(glomeruli)
-    glomeruli_unique_max = np.max(glomeruli)
-    gloms = np.zeros((glomeruli_unique_max,7))
-    props = measure.regionprops(glomeruli)
-
-    for i in tqdm(range(glomeruli_unique_max),desc=GLOM_DICT[glom_value]):
-        #Area, Cellularity, Mesangial Area to Cellularity Ratio
-        area = (props[i].area)*(mpp**2)
-        x1,y1,x2,y2 = props[i].bbox
+def process_glom_features(points, MOD, slide, h_threshold, saturation_threshold):
+
+    #Area, Cellularity, Mesangial Area to Cellularity Ratio
+    area = cv2.contourArea(points)
+    if area>0:
+        y1, y2, x1, x2 =[np.min(points[:,0]),np.max(points[:,0]),np.min(points[:,1]),np.max(points[:,1])]
         crop = slide.read_region((y1,x1),0,(y2-y1,x2-x1))
         crop = np.array(crop)
         crop = crop[:,:,:3]
 
-        mask = (glomeruli[x1:x2,y1:y2] == i+1).astype(np.uint8)
+        mask = np.zeros((x2-x1,y2-y1),dtype=np.uint8)
+        points[:,0]-=y1
+        points[:,1]-=x1
+        mask=cv2.fillPoly(mask,[points],1)
 
         hsv = rgb2hsv(crop)
         hsv = hsv[:,:,1]
@@ -51,41 +46,24 @@ def process_glom_features(mask_xml, glom_value, MOD, slide, mpp, sa
         mask_pixels = np.sum(mask)
         pas_pixels = np.sum(pas_seg)
 
-        mes_fraction = (pas_pixels*(mpp**2)) / area
-
-        gloms[i,0] = x1
-        gloms[i,1] = x2
-        gloms[i,2] = y1
-        gloms[i,3] = y2
-        gloms[i,4] = area
-        gloms[i,5] = pas_pixels*(mpp**2)
-        gloms[i,6] = mes_fraction
-
-    del glomeruli
-
-    return gloms
+        mes_fraction = (pas_pixels) / area
 
-def process_tubules_features(mask_xml, tub_value, MOD, slide, mpp, whitespace_threshold):
+        return [x1,x2,y1,y2, area, pas_pixels, mes_fraction]
 
-    tubules = mask_xml == tub_value
-    tubules = tubules.astype(np.uint8)
-    tubules = measure.label(tubules)
-    tubules_unique_max = np.max(tubules)
-    tubs = np.zeros((tubules_unique_max,7))
-    props = measure.regionprops(tubules)
-
-    for i in tqdm(range(tubules_unique_max),desc='Tubules'):
-        #Area, Cellularity, Mesangial Area to Cellularity Ratio
-        # i=2615
-        area = (props[i].area)*(mpp**2)
-        x1,y1,x2,y2 = props[i].bbox
+def process_tubules_features(points, MOD, slide, whitespace_threshold):
+    area = cv2.contourArea(points)
+    if area>0:
+        y1, y2, x1, x2 =[np.min(points[:,0]),np.max(points[:,0]),np.min(points[:,1]),np.max(points[:,1])]
         crop = slide.read_region((y1,x1),0,(y2-y1,x2-x1))
         crop = np.array(crop)
         crop = crop[:,:,:3]
 
-        mask = (tubules[x1:x2,y1:y2] == i+1).astype(np.uint8)
+        mask = np.zeros((x2-x1,y2-y1),dtype=np.uint8)
+        points[:,0]-=y1
+        points[:,1]-=x1
+        mask=cv2.fillPoly(mask,[points],1)
 
         lab = rgb2lab(crop)
         lab = lab[:,:,0]
@@ -156,38 +134,12 @@ def process_tubules_features(mask_xml, tub_value, MOD, slide, mpp, whitespace_th
         area_tot = np.sum(np.array(cyto_areas))
         cyto_avg = np.sum(np.multiply(np.array(cyto_thicknesses),(np.array(cyto_areas)/area_tot)))
 
-        tubs[i,0] = x1
-        tubs[i,1] = x2
-        tubs[i,2] = y1
-        tubs[i,3] = y2
-        tubs[i,4] = tbm_avg*(mpp**2)
-        tubs[i,5] = cyto_avg*(mpp**2)
-        tubs[i,6] = np.sum(WS) / np.sum(mask)#
-
-    del tubules
-
-    return tubs
-
-
-def process_arteriol_features(mask_xml, art_value, mpp):
-
-    arteries = mask_xml == art_value
-    arteries = arteries.astype(np.uint8)
-    arteries = measure.label(arteries)
-    arts_unique_max = np.max(arteries)
-    arts = np.zeros((arts_unique_max,5))
-    props = measure.regionprops(arteries)
-
-    for i in tqdm(range(arts_unique_max),desc='Arteries'):
-        area = (props[i].area)*(mpp**2)
-        x1,y1,x2,y2 = props[i].bbox
+        return [x1,x2,y1,y2,tbm_avg,cyto_avg,np.sum(WS) / np.sum(mask)]
 
-        arts[i,0] = x1
-        arts[i,1] = x2
-        arts[i,2] = y1
-        arts[i,3] = y2
-        arts[i,4] = area
 
-    del arteries
+def process_arteriol_features(points):
 
-    return arts
+    area = cv2.contourArea(points)
+    if area>0:
+        y1, y2, x1, x2 =[np.min(points[:,0]),np.max(points[:,0]),np.min(points[:,1]),np.max(points[:,1])]
+        return [x1,x2,y1,y2,area]
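Reviewer note: the refactor above replaces whole-slide labeled masks (skimage regionprops over a rasterized XML) with per-contour crops rasterized on demand. The core pattern, sketched with a synthetic contour — names follow the code's convention where points[:,0] is the first slide coordinate:

import numpy as np
import cv2

# Synthetic contour in slide coordinates, stand-in for one XML region.
points = np.array([[120, 40], [180, 40], [180, 110], [120, 110]])

area = cv2.contourArea(points)
y1, y2 = points[:, 0].min(), points[:, 0].max()
x1, x2 = points[:, 1].min(), points[:, 1].max()

# Shift the contour into the crop's frame and rasterize it.
mask = np.zeros((x2 - x1, y2 - y1), dtype=np.uint8)
shifted = (points - [y1, x1]).astype(np.int32)
mask = cv2.fillPoly(mask, [shifted], 1)

# fillPoly includes the boundary, so mask.sum() is slightly larger than contourArea.
print(area, mask.sum())

This avoids allocating a full-resolution label image for the whole WSI, at the cost of computing each object's bounding box from its vertices.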
diff --git a/multic/segmentationschool/extraction_utils/run_pathomic_fe.py b/multic/segmentationschool/extraction_utils/run_pathomic_fe.py
new file mode 100644
index 0000000..f453c53
--- /dev/null
+++ b/multic/segmentationschool/extraction_utils/run_pathomic_fe.py
@@ -0,0 +1,175 @@
+import sys
+import os, girder_client
+import numpy as np
+from tqdm import tqdm
+import pandas as pd
+from tiffslide import TiffSlide
+import lxml.etree as ET
+import multiprocessing
+from joblib import Parallel, delayed
+
+MODx=np.zeros((3,))
+MODy=np.zeros((3,))
+MODz=np.zeros((3,))
+MODx[0]= 0.644211
+MODy[0]= 0.716556
+MODz[0]= 0.266844
+
+MODx[1]= 0.175411
+MODy[1]= 0.972178
+MODz[1]= 0.154589
+
+MODx[2]= 0.0
+MODy[2]= 0.0
+MODz[2]= 0.0
+MOD=[MODx,MODy,MODz]
+
+from .extract_ffpe_features import xml_to_mask
+
+from .process_mc_features import process_glom_features, process_tubules_features, process_arteriol_features
+from segmentationschool.Codes.xml_to_mask_minmax import write_minmax_to_xml
+from segmentationschool.utils.json_to_xml import get_xml_path
+
+def getPathomicFeatures(args):
+
+    cores=multiprocessing.cpu_count()
+    folder = args.base_dir
+
+    # assert args.target is not None, 'Directory of xmls must be specified, use --target /path/to/files.xml'
+    # assert args.wsis is not None, 'Directory of WSIs must be specified, use --wsis /path/to/wsis'
+    if args.platform == 'DSA':
+        import girder_client
+        gc = girder_client.GirderClient(apiUrl=args.girderApiUrl)
+        gc.setToken(args.girderToken)
+
+        file_name = args.file.split('/')[-1]
+        slide_item_id = args.item_id
+        output_dir = args.base_dir
+        slide_name,slideExt=file_name.split('.')
+        items=[(args.file, args.xml_path)]
+
+    elif args.platform == 'HPG':
+        image_files = [image_name for _, _, files in os.walk(folder) for image_name in files if image_name.endswith('.xml')]
+        image_names = [os.path.join(folder, f.split('.')[0]) for f in image_files]
+        slideExt = args.ext
+        output_dir = args.output_dir
+        # each item in items is a tuple of (images, annotations)
+        items = [(f + slideExt, f + '.xml') for f in image_names]
+    else:
+        raise Exception("Please enter a valid platform: DSA or HPG")
+
+    for i in range(len(items)):
+
+        svsfile, xmlfile = items[i]
+
+        print(xmlfile,'here')
+        write_minmax_to_xml(xmlfile)
+        slide = TiffSlide(svsfile)
+
+        all_contours = {'1':[],'2':[],'3':[],'4':[],'5':[],'6':[]}
+        tree = ET.parse(xmlfile)
+        root = tree.getroot()
+
+        xlsx_path = os.path.join(output_dir, os.path.basename(svsfile).split('.')[0] +'_pathomic'+'.xlsx')
+        for Annotation in root.findall("./Annotation"): # for all annotations
+            annotationID = Annotation.attrib['Id']
+            if annotationID not in ['1','2','3','4','5','6']:
+                pass
+            else:
+                for Region in Annotation.findall("./*/Region"): # iterate on all region
+                    verts=[]
+                    for Vert in Region.findall("./Vertices/Vertex"): # iterate on all vertex in region
+                        verts.append([int(float(Vert.attrib['X'])),int(float(Vert.attrib['Y']))])
+                    all_contours[annotationID].append(np.array(verts))
+
+        glom_features=Parallel(n_jobs=cores,prefer="threads")(delayed(process_glom_features)(points,
+            MOD, slide, h_threshold=args.h_threshold, saturation_threshold=args.saturation_threshold) for points in tqdm(all_contours['3'],colour='yellow',unit='Glomerulus',leave=False))
+        s_glom_features=Parallel(n_jobs=cores,prefer="threads")(delayed(process_glom_features)(points,
+            MOD, slide, h_threshold=args.h_threshold, saturation_threshold=args.saturation_threshold) for points in tqdm(all_contours['4'],colour='yellow',unit='Scl. Glomerulus',leave=False))
+        tub_features = Parallel(n_jobs=cores,prefer="threads")(delayed(process_tubules_features)(points,
+            MOD, slide,whitespace_threshold=args.whitespace_threshold) for points in tqdm(all_contours['5'],colour='blue',unit='Tubule',leave=False))
+        art_features=Parallel(n_jobs=cores,prefer="threads")(delayed(process_arteriol_features)(points,
+            ) for points in tqdm(all_contours['6'], colour='magenta',unit='Artery(-iole)',leave=False))
+        glom_features=[i for i in glom_features if i is not None]
+        s_glom_features=[i for i in s_glom_features if i is not None]
+        tub_features=[i for i in tub_features if i is not None]
+        art_features=[i for i in art_features if i is not None]
+
+        def get_feature_matrix(features, columns):
+            m, n = len(features), len(columns)
+            feature_matrix = np.zeros((m, n))
+            for i, feat in enumerate(features):
+                for j in range(n):
+                    feature_matrix[i,j] = feat[j]
+
+            return feature_matrix
+        # gloms = np.zeros((len(glom_features),7))
+        # s_gloms = np.zeros((len(s_glom_features),7))
+        # tubs = np.zeros((len(tub_features),7))
+        # arts = np.zeros((len(art_features),5))
+
+        # for i, glom in enumerate(glom_features):
+        #     gloms[i,0] = glom[0]
+        #     gloms[i,1] = glom[1]
+        #     gloms[i,2] = glom[2]
+        #     gloms[i,3] = glom[3]
+        #     gloms[i,4] = glom[4]
+        #     gloms[i,5] = glom[5]
+        #     gloms[i,6] = glom[6]
+
+        # for i, glom in enumerate(s_glom_features):
+        #     s_gloms[i,0] = s_glom[0]
+        #     s_gloms[i,1] = s_glom[1]
+        #     s_gloms[i,2] = s_glom[2]
+        #     s_gloms[i,3] = s_glom[3]
+        #     s_gloms[i,4] = s_glom[4]
+        #     s_gloms[i,5] = s_glom[5]
+        #     s_gloms[i,6] = s_glom[6]
+
+        # for i, tub in enumerate(tub_features):
+        #     if not tub:
+        #         continue
+        #     tubs[i,0] = tub[0]
+        #     tubs[i,1] = tub[1]
+        #     tubs[i,2] = tub[2]
+        #     tubs[i,3] = tub[3]
+        #     tubs[i,4] = tub[4]
+        #     tubs[i,5] = tub[5]
+        #     tubs[i,6] = tub[6]
+
+        # for i, art in enumerate(art_features):
+        #     if not art:
+        #         continue
+        #     arts[i,0] = art[0]
+        #     arts[i,1] = art[1]
+        #     arts[i,2] = art[2]
+        #     arts[i,3] = art[3]
+        #     arts[i,4] = art[4]
+
+        all_columns = [['x1','x2','y1','y2','Area','Mesangial Area','Mesangial Fraction'],
+            ['x1','x2','y1','y2','Area','Mesangial Area','Mesangial Fraction'],
+            ['x1','x2','y1','y2','Average TBM Thickness','Average Cell Thickness','Luminal Fraction'],
+            ['x1','x2','y1','y2','Arterial Area']]
+        compart_names = ['gloms','s_gloms','tubs','arts']
+
+        gloms = get_feature_matrix(glom_features, all_columns[0])
+        s_gloms = get_feature_matrix(s_glom_features,all_columns[1])
+        tubs = get_feature_matrix(tub_features, all_columns[2])
+        arts = get_feature_matrix(art_features,all_columns[3])
+
+        all_comparts = [gloms,s_gloms,tubs, arts]
+
+        _ = os.system("printf '\tWriting Excel file: [{}]\n'".format(xlsx_path))
+        with pd.ExcelWriter(xlsx_path) as writer:
+            for idx,compart in enumerate(all_comparts):
+                df = pd.DataFrame(compart,columns=all_columns[idx])
+                df.to_excel(writer, index=False, sheet_name=compart_names[idx])
+        if args.platform == 'DSA':
+            gc.uploadFileToItem(slide_item_id, xlsx_path, reference=None, mimeType=None, filename=None, progressCallback=None)
+            print('Girder file uploaded!')
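Reviewer note: the one-sheet-per-compartment output uses the standard pandas ExcelWriter pattern. A toy sketch, assuming an xlsx engine such as XlsxWriter (added to setup.py below) is installed — the filename and rows are illustrative:

import numpy as np
import pandas as pd

columns = ['x1', 'x2', 'y1', 'y2', 'Area']
features = [[0, 10, 0, 8, 80.0], [5, 25, 3, 9, 120.0]]  # illustrative rows

with pd.ExcelWriter('pathomic_demo.xlsx') as writer:    # placeholder filename
    pd.DataFrame(np.asarray(features), columns=columns).to_excel(
        writer, index=False, sheet_name='arts')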
diff --git a/multic/segmentationschool/segmentation_school.py b/multic/segmentationschool/segmentation_school.py
index 50e32c1..d31b06a 100644
--- a/multic/segmentationschool/segmentation_school.py
+++ b/multic/segmentationschool/segmentation_school.py
@@ -47,7 +47,7 @@ def str2bool(v):
 
 def main(args):
     from segmentationschool.Codes.InitializeFolderStructure import initFolder, purge_training_set, prune_training_set
-    # from extract_reference_features import getKidneyReferenceFeatures,summarizeKidneyReferenceFeatures
+    from segmentationschool.Codes.extract_reference_features import getKidneyReferenceFeatures
     # from TransformXMLs import splice_cortex_XMLs,register_aperio_scn_xmls
     # from randomCropGenerator import randomCropGenerator
     if args.one_network == True:
@@ -121,8 +121,8 @@ def savetime(args, starttime):
                         help='girderApiUrl')
     parser.add_argument('--girderToken', dest='girderToken', default=' ' ,type=str,
                         help='girderToken')
-    parser.add_argument('--files', dest='files', default=' ' ,type=str,
-                        help='files')
+    parser.add_argument('--file', dest='file', default=' ' ,type=str,
+                        help='input WSI file name')
     # option
     parser.add_argument('--option', dest='option', default=' ' ,type=str,
                         help='option for [new, train, predict, validate]')
@@ -134,8 +134,8 @@ def savetime(args, starttime):
                         help='directory with xml transformation targets')
     parser.add_argument('--cortextarget', dest='cortextarget', default=None,type=str,
                         help='directory with cortex annotations for splicing')
-    parser.add_argument('--output', dest='output', default=None,type=str,
-                        help='directory to save output transformed XMLs')
+    parser.add_argument('--output_dir', dest='output_dir', default=None,type=str,
+                        help='directory to save output excel file')
     parser.add_argument('--wsis', dest='wsis', default=None,type=str,
                         help='directory of WSIs for reference feature extraction')
     parser.add_argument('--groupBy', dest='groupBy', default=None,type=str,
@@ -165,7 +165,7 @@ def savetime(args, starttime):
 
     # automatically generated
     parser.add_argument('--base_dir', dest='base_dir', default=os.getcwd(),type=str,
-                        help='base directory of code folder')
+                        help='base directory of Data folder')
     parser.add_argument('--code_dir', dest='code_dir', default=os.getcwd(),type=str,
                         help='base directory of code folder')
 
@@ -280,6 +280,18 @@ def savetime(args, starttime):
                         help='padded region for low resolution region extraction')
     parser.add_argument('--show_interstitium', dest='show_interstitium', default=True ,type=str2bool,
                         help='padded region for low resolution region extraction')
+
+    parser.add_argument('--xml_path', dest='xml_path', default=' ' ,type=str,
+                        help='path to xml file')
+
+    parser.add_argument('--ext', dest='ext', default='.svs' ,type=str,
+                        help='file extension')
+
+    parser.add_argument('--platform', dest='platform', default='DSA' ,type=str,
+                        help='Run Platform, HPG or DSA')
+
+    parser.add_argument('--item_id', dest='item_id', default=' ' , type=str,
+                        help='item id of the WSI in DSA')
diff --git a/multic/segmentationschool/utils/json_to_xml.py b/multic/segmentationschool/utils/json_to_xml.py
new file mode 100644
index 0000000..fd3ff91
--- /dev/null
+++ b/multic/segmentationschool/utils/json_to_xml.py
@@ -0,0 +1,67 @@
+import os
+from .mask_to_xml import xml_create, xml_add_annotation, xml_add_region, xml_save
+
+def get_xml_path(annot, NAMES, tmp, slidename):
+
+    xmlAnnot = xml_create()
+    skipSlide = 0
+    xml_color=[65280]*(len(NAMES)+1)
+    # all compartments
+    for class_,compart in enumerate(NAMES):
+
+        compart = compart.replace(' ','')
+        class_ +=1
+        # add layer to xml
+        xmlAnnot = xml_add_annotation(Annotations=xmlAnnot, xml_color=xml_color, annotationID=class_)
+
+        # test all annotation layers in order created
+        for iter,a in enumerate(annot):
+            try:
+                # check for annotation layer by name
+                a_name = a['annotation']['name'].replace(' ','')
+            except:
+                a_name = None
+
+            if a_name == compart:
+                # track all layers present
+                skipSlide +=1
+
+                pointsList = []
+
+                # load json data
+                _ = os.system("printf '\tloading annotation layer: [{}]\n'".format(compart))
+
+                a_data = a['annotation']['elements']
+
+                for data in a_data:
+                    pointList = []
+                    points = data['points']
+                    for point in points:
+                        pt_dict = {'X': round(point[0]), 'Y': round(point[1])}
+                        pointList.append(pt_dict)
+                    pointsList.append(pointList)
+
+                # write annotations to xml
+                for i in range(len(pointsList)):
+                    pointList = pointsList[i]
+                    xmlAnnot = xml_add_region(Annotations=xmlAnnot, pointList=pointList, annotationID=class_)
+
+                # print(a['_version'], a['updated'], a['created'])
+                break
+
+    if skipSlide != len(NAMES):
+        _ = os.system("printf '\tThis slide is missing annotation layers\n'")
+        _ = os.system("printf '\tSKIPPING SLIDE...\n'")
+
+    _ = os.system("printf '\tFETCHING SLIDE...\n'")
+    #os.rename('{}/{}'.format(folder, slidename), '{}/{}'.format(tmp, slidename))
+
+    xml_path = '{}/{}.xml'.format(tmp, os.path.splitext(slidename)[0])
+    _ = os.system("printf '\tsaving a created xml annotation file: [{}]\n'".format(xml_path))
+    xml_save(Annotations=xmlAnnot, filename=xml_path)
+
+    del xmlAnnot
+
+    return xml_path
\ No newline at end of file
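Reviewer note: get_xml_path consumes DSA annotation documents as returned by GET /annotation/item/{id}. A minimal fabricated document showing just the fields it reads, assuming the package layout in this PR is importable — layer name, points, and paths are illustrative:

from segmentationschool.utils.json_to_xml import get_xml_path

NAMES = ['tubules']
annot = [{
    'annotation': {
        'name': 'tubules',
        # DSA stores vertices as [x, y, z] triples.
        'elements': [{'points': [[100.2, 200.7, 0], [150.0, 210.3, 0], [120.5, 260.0, 0]]}],
    },
}]

# Writes /tmp/slide.xml and returns its path.
xml_path = get_xml_path(annot, NAMES, '/tmp', 'slide.svs')
print(xml_path)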
diff --git a/multic/segmentationschool/utils/mask_to_xml.py b/multic/segmentationschool/utils/mask_to_xml.py
new file mode 100644
index 0000000..34217e5
--- /dev/null
+++ b/multic/segmentationschool/utils/mask_to_xml.py
@@ -0,0 +1,135 @@
+import cv2
+import numpy as np
+import lxml.etree as ET
+
+"""
+xml_path (string) - the filename of the saved xml
+mask (array) - the mask to convert to xml - uint8 array
+downsample (int) - amount of downsampling done to the mask
+    points are upsampled - this can be used to simplify the mask
+min_size_thresh (int) - the minimum object size allowed in the mask. This is referenced from downsample=1
+xml_color (list) - list of binary color values to be used for classes
+
+"""
+
+def mask_to_xml(xml_path, mask, downsample=1, min_size_thresh=0, simplify_contours=0, xml_color=[65280, 65535, 33023, 255, 16711680], verbose=0, return_root=False, maxClass=None, offset={'X': 0,'Y': 0}):
+
+    min_size_thresh /= downsample
+
+    # create xml tree
+    Annotations = xml_create()
+
+    # get all classes
+    classes = np.unique(mask)
+    if maxClass is None:
+        maxClass = max(classes)
+
+    # add annotation classes to tree
+    for class_ in range(maxClass+1)[1:]:
+        if verbose:
+            print('Creating class: [{}]'.format(class_))
+        Annotations = xml_add_annotation(Annotations=Annotations, xml_color=xml_color, annotationID=class_)
+
+    # add contour points to tree classwise
+    for class_ in classes: # iterate through all classes
+
+        if class_ == 0 or class_ > maxClass:
+            continue
+
+        if verbose:
+            print('Working on class [{} of {}]'.format(class_, max(classes)))
+
+        # binarize the mask w.r.t. class_
+        binaryMask = mask==class_
+
+        # get contour points of the mask
+        pointsList = get_contour_points(binaryMask, downsample=downsample, min_size_thresh=min_size_thresh, simplify_contours=simplify_contours, offset=offset)
+        for i in range(np.shape(pointsList)[0]):
+            pointList = pointsList[i]
+            Annotations = xml_add_region(Annotations=Annotations, pointList=pointList, annotationID=class_)
+
+    if return_root:
+        # return root, do not save xml file
+        return Annotations
+
+    # save the final xml file
+    xml_save(Annotations=Annotations, filename='{}.xml'.format(xml_path.split('.')[0]))
+
+
+def get_contour_points(mask, downsample, min_size_thresh=0, simplify_contours=0, offset={'X': 0,'Y': 0}):
+    # returns a dict pointList with point 'X' and 'Y' values
+    # input greyscale binary image
+    #_, maskPoints, contours = cv2.findContours(np.array(mask), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_KCOS)
+    maskPoints, contours = cv2.findContours(np.uint8(mask), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_L1)
+    maskPoints = list(maskPoints)
+    # remove small regions
+    too_small = []
+    for idx, cnt in enumerate(maskPoints):
+        area = cv2.contourArea(cnt)
+        if area < min_size_thresh:
+            too_small.append(idx)
+    if too_small != []:
+        too_small.reverse()
+        for idx in too_small:
+            maskPoints.pop(idx)
+
+    if simplify_contours > 0:
+        for idx, cnt in enumerate(maskPoints):
+            epsilon = simplify_contours*cv2.arcLength(cnt,True)
+            approx = cv2.approxPolyDP(cnt,epsilon,True)
+            maskPoints[idx] = approx
+
+    pointsList = []
+    for j in range(np.shape(maskPoints)[0]):
+        pointList = []
+        for i in range(0,np.shape(maskPoints[j])[0]):
+            point = {'X': (maskPoints[j][i][0][0] * downsample) + offset['X'], 'Y': (maskPoints[j][i][0][1] * downsample) + offset['Y']}
+            pointList.append(point)
+        pointsList.append(pointList)
+    return pointsList
+
+### functions for building an xml tree of annotations ###
+def xml_create(): # create new xml tree
+    # create new xml Tree - Annotations
+    Annotations = ET.Element('Annotations')
+    return Annotations
+
+def xml_add_annotation(Annotations, xml_color, annotationID=None): # add new annotation
+    # add new Annotation to Annotations
+    # defaults to new annotationID
+    if annotationID == None: # not specified
+        annotationID = len(Annotations.findall('Annotation')) + 1
+    Annotation = ET.SubElement(Annotations, 'Annotation', attrib={'Type': '4', 'Visible': '1', 'ReadOnly': '0', 'Incremental': '0', 'LineColorReadOnly': '0', 'LineColor': str(xml_color[annotationID-1]), 'Id': str(annotationID), 'NameReadOnly': '0'})
+    Regions = ET.SubElement(Annotation, 'Regions')
+    return Annotations
+
+def xml_add_region(Annotations, pointList, annotationID=-1, regionID=None): # add new region to annotation
+    # add new Region to Annotation
+    # defaults to last annotationID and new regionID
+    Annotation = Annotations.find("Annotation[@Id='" + str(annotationID) + "']")
+    Regions = Annotation.find('Regions')
+    if regionID == None: # not specified
+        regionID = len(Regions.findall('Region')) + 1
+    Region = ET.SubElement(Regions, 'Region', attrib={'NegativeROA': '0', 'ImageFocus': '-1', 'DisplayId': '1', 'InputRegionId': '0', 'Analyze': '0', 'Type': '0', 'Id': str(regionID)})
+    Vertices = ET.SubElement(Region, 'Vertices')
+    for point in pointList: # add new Vertex
+        ET.SubElement(Vertices, 'Vertex', attrib={'X': str(point['X']), 'Y': str(point['Y']), 'Z': '0'})
+    # add connecting point
+    ET.SubElement(Vertices, 'Vertex', attrib={'X': str(pointList[0]['X']), 'Y': str(pointList[0]['Y']), 'Z': '0'})
+    return Annotations
+
+def xml_save(Annotations, filename):
+    xml_data = ET.tostring(Annotations, pretty_print=True)
+    #xml_data = Annotations.toprettyxml()
+    f = open(filename, 'w')
+    f.write(xml_data.decode())
+    f.close()
+
+def read_xml(filename):
+    # import xml file
+    tree = ET.parse(filename)
+    root = tree.getroot()
+    return root
\ No newline at end of file
diff --git a/setup.py b/setup.py
index fff61e6..b89b3c7 100644
--- a/setup.py
+++ b/setup.py
@@ -78,6 +78,7 @@ def prerelease_local_scheme(version):
         'girder-client',
         # cli
         'ctk-cli',
+        'XlsxWriter',
     ],
     license='Apache Software License 2.0',
     keywords='multic',
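Reviewer note: the new mask_to_xml helper can be exercised end-to-end without a slide. A minimal sketch, assuming the package is importable from the repo root — the mask contents and output name are illustrative:

import numpy as np
from segmentationschool.utils.mask_to_xml import mask_to_xml

# Two-class toy mask: one filled square per class.
mask = np.zeros((100, 100), dtype=np.uint8)
mask[10:40, 10:40] = 1
mask[60:90, 60:90] = 2

# Writes demo.xml with one Annotation per class and one Region per blob.
mask_to_xml('demo.xml', mask, downsample=1, verbose=1)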