diff --git a/.gitattributes b/.gitattributes index 7f4d5667..3e0ca4ab 100644 --- a/.gitattributes +++ b/.gitattributes @@ -10,4 +10,3 @@ mafft binary .gitattributes export-ignore .gitignore export-ignore .github export-ignore -*.mat filter=lfs diff=lfs merge=lfs -text diff --git a/doc/external/kegg/getGenesFromKEGG.html b/doc/external/kegg/getGenesFromKEGG.html index 2035078a..cba93c72 100644 --- a/doc/external/kegg/getGenesFromKEGG.html +++ b/doc/external/kegg/getGenesFromKEGG.html @@ -102,7 +102,7 @@


0001 function model=getGenesFromKEGG(keggPath,koList) @@ -178,273 +178,339 @@SOURCE CODE
'external','kegg','keggGenes.mat'); -0074 if exist(genesFile, 'file') -0075 fprintf(['Importing KEGG genes from ' strrep(genesFile,'\','/') '... ']); -0076 load(genesFile); -0077 else -0078 fprintf(['NOTE: Cannot locate ' strrep(genesFile,'\','/') ', it will therefore be generated from the local KEGG database\n']); -0079 if ~isfile(fullfile(keggPath,'ko')) || ~isfile(fullfile(keggPath,'reaction')) -0080 EM=fprintf(['The files ''ko'' and ''reaction'' cannot be located at ' strrep(keggPath,'\','/') '/ and should be downloaded from the KEGG FTP\n']); -0081 dispEM(EM); -0082 else -0083 fprintf('Generating keggGenes.mat file...\n'); -0084 %Get all KOs that are associated to reactions -0085 allKOs=getAllKOs(keggPath); -0086 -0087 %Since the list of genes will be accessed many times the hash table -0088 %is established -0089 geneMap=containers.Map('KeyType','char','ValueType','int32'); -0090 -0091 %Add new functionality in the order specified in models -0092 model.id='KEGG'; -0093 model.name='Automatically generated from KEGG database'; -0094 -0095 %Preallocate memory -0096 model.rxns=cell(numel(allKOs),1); -0097 model.rxnNames=cell(numel(allKOs),1); -0098 -0099 %Reserve space for 500000 genes -0100 model.genes=cell(500000,1); +0074 if ~exist(genesFile, 'file') +0075 try +0076 downloadKEGGgenes(); +0077 catch +0078 %Download failed; fall through to local regeneration below +0079 end +0080 end +0081 if exist(genesFile, 'file') +0082 fprintf(['Importing KEGG genes from ' strrep(genesFile,'\','/') '... ']); +0083 load(genesFile); +0084 else +0085 fprintf(['NOTE: Cannot locate ' strrep(genesFile,'\','/') ', it will therefore be generated from the local KEGG database\n']); +0086 if ~isfile(fullfile(keggPath,'ko')) || ~isfile(fullfile(keggPath,'reaction')) +0087 EM=fprintf(['The files ''ko'' and ''reaction'' cannot be located at ' strrep(keggPath,'\','/') '/ and should be downloaded from the KEGG FTP\n']); +0088 dispEM(EM); +0089 else +0090 fprintf('Generating keggGenes.mat file...\n'); +0091 %Get all KOs that are associated to reactions +0092 allKOs=getAllKOs(keggPath); +0093 +0094 %Since the list of genes will be accessed many times the hash table +0095 %is established +0096 geneMap=containers.Map('KeyType','char','ValueType','int32'); +0097 +0098 %Add new functionality in the order specified in models +0099 model.id='KEGG'; +0100 model.name='Automatically generated from KEGG database'; 0101 -0102 %Load information on KO ID, name, and associated genes -0103 fid = fopen(fullfile(keggPath,'ko'), 'r'); -0104 -0105 %Keeps track of how many KOs that have been added -0106 koCounter=0; -0107 nGenes=0; -0108 addingGenes=false; -0109 -0110 %These contain the mapping between KOs and genes and are used to -0111 %generate the model.rxnGeneMat in the end -0112 koIndex=zeros(5000000,1); -0113 geneIndex=koIndex; -0114 nMappings=0; -0115 -0116 skipRead=false; -0117 -0118 %Loop through the file -0119 while 1 -0120 %Get the next line -0121 if skipRead==false -0122 tline = fgetl(fid); -0123 else -0124 skipRead=false; -0125 end -0126 -0127 %Abort at end of file -0128 if ~ischar(tline) -0129 break; -0130 end -0131 -0132 %Skip '///' -0133 if numel(tline)<12 -0134 addingGenes=false; -0135 continue; -0136 end -0137 -0138 %Check if it's a new reaction -0139 if strcmp(tline(1:12),'ENTRY ') -0140 %Check if it should be added -0141 koID=tline(13:18); -0142 -0143 if ismember(koID,allKOs) -0144 addMe=true; -0145 koCounter=koCounter+1; -0146 else -0147 addMe=false; -0148 continue; -0149 end -0150 -0151 %Add reaction ID (always 6 characters) -0152 model.rxns{koCounter}=koID; -0153 -0154 model.rxnNames{koCounter}=''; -0155 %Will be overwritten if it exists -0156 end -0157 -0158 %To get here we must be in a KO that should be added -0159 if addMe==true -0160 %Add name -0161 if strcmp(tline(1:12),'DEFINITION ') -0162 model.rxnNames{koCounter}=tline(13:end); -0163 end -0164 -0165 %Check if it's time to start adding genes -0166 if strcmp(tline(1:12),'GENES ') -0167 addingGenes=true; -0168 end -0169 -0170 %Add genes -0171 if addingGenes==true -0172 %We are now adding genes for the current KO. All gene -0173 %rows start with 12 spaces. If this is not true it -0174 %means that there is additional annotation such as -0175 %AUTHORS. This is not common but it exists -0176 if strcmp(tline(1:12),' ') || strcmp(tline(1:12),'GENES ') -0177 geneLine=tline(13:end); -0178 -0179 %Check if the line is from a new organism of from -0180 %the same as before -0181 if strcmp(geneLine(4),':') -0182 %If organism id contains 3 letters; -0183 currentOrganism=geneLine(1:3); -0184 %Parse the string -0185 genes=regexp(geneLine(6:end),' ','split'); -0186 genes=strcat(currentOrganism,':',genes(:)); -0187 elseif strcmp(geneLine(5),':') -0188 %If organism id contains 4 letters; -0189 currentOrganism=geneLine(1:4); -0190 %Parse the string -0191 genes=regexp(geneLine(7:end),' ','split'); -0192 genes=strcat(currentOrganism,':',genes(:)); -0193 end -0194 -0195 %Add the genes to the gene list -0196 for i=1:numel(genes) -0197 geneString=genes{i}; -0198 if geneMap.isKey(geneString) -0199 index=geneMap(geneString); -0200 else -0201 nGenes=nGenes+1; -0202 model.genes{nGenes}=geneString; -0203 index=nGenes; -0204 geneMap(geneString)=index; -0205 end -0206 -0207 %Add the connection to the KO -0208 nMappings=nMappings+1; -0209 koIndex(nMappings)=koCounter; -0210 geneIndex(nMappings)=index; -0211 end -0212 else -0213 %Now we want to throw away everything until the -0214 %next entry -0215 while 1 -0216 tline = fgetl(fid); -0217 if strcmp(tline,'///') -0218 %When the new entry is found, skip reading -0219 %one line to fit with the rest of the code -0220 skipRead=true; -0221 break; -0222 end -0223 end -0224 end -0225 end -0226 end -0227 end -0228 %Close the file -0229 fclose(fid); -0230 -0231 %If too much space was allocated, shrink the model -0232 model.rxns=model.rxns(1:koCounter); -0233 model.rxnNames=model.rxnNames(1:koCounter); -0234 model.genes=model.genes(1:nGenes); -0235 -0236 %Retrieve and add the gene associations -0237 model.rxnGeneMat=sparse(koIndex(1:nMappings),geneIndex(1:nMappings),ones(nMappings,1)); -0238 -0239 %To make sure the size is correct if the last KOs don't have genes -0240 if size(model.rxnGeneMat,1)~=koCounter -0241 model.rxnGeneMat(koCounter,1)=0; -0242 end -0243 -0244 %Trim the genes so that they only contain information that can be -0245 %matched to the KEGG file of protein sequences (remove all -0246 %information after first parenthesis) -0247 %NOTE: For some reason the organism abbreviation should be in lower -0248 %case in this database. Fix this here -0249 for i=1:numel(model.genes) -0250 parIndex=strfind(model.genes{i},'('); -0251 if any(parIndex) -0252 model.genes{i}=model.genes{i}(1:parIndex-1); -0253 end -0254 colIndex=strfind(model.genes{i},':'); -0255 model.genes{i}=[lower(model.genes{i}(1:colIndex-1)) model.genes{i}(colIndex:end)]; -0256 end -0257 -0258 %Save the model structure -0259 save(genesFile,'model'); -0260 end -0261 end -0262 -0263 %Only get the KOs in koList -0264 if nargin>1 -0265 I=~ismember(model.rxns,koList); -0266 model=removeReactions(model,I,true,true); -0267 end -0268 fprintf('COMPLETE\n'); -0269 end -0270 -0271 function allKOs=getAllKOs(keggPath) -0272 %Retrieves all KOs that are associated to reactions. This is because the -0273 %number of genes in KEGG is very large so without this parsing it would -0274 %take many hours -0275 -0276 allKOs={}; +0102 %Preallocate memory +0103 model.rxns=cell(numel(allKOs),1); +0104 model.rxnNames=cell(numel(allKOs),1); +0105 +0106 %Reserve space for 500000 genes +0107 model.genes=cell(500000,1); +0108 +0109 %Load information on KO ID, name, and associated genes +0110 fid = fopen(fullfile(keggPath,'ko'), 'r'); +0111 +0112 %Keeps track of how many KOs that have been added +0113 koCounter=0; +0114 nGenes=0; +0115 addingGenes=false; +0116 +0117 %These contain the mapping between KOs and genes and are used to +0118 %generate the model.rxnGeneMat in the end +0119 koIndex=zeros(5000000,1); +0120 geneIndex=koIndex; +0121 nMappings=0; +0122 +0123 skipRead=false; +0124 +0125 %Loop through the file +0126 while 1 +0127 %Get the next line +0128 if skipRead==false +0129 tline = fgetl(fid); +0130 else +0131 skipRead=false; +0132 end +0133 +0134 %Abort at end of file +0135 if ~ischar(tline) +0136 break; +0137 end +0138 +0139 %Skip '///' +0140 if numel(tline)<12 +0141 addingGenes=false; +0142 continue; +0143 end +0144 +0145 %Check if it's a new reaction +0146 if strcmp(tline(1:12),'ENTRY ') +0147 %Check if it should be added +0148 koID=tline(13:18); +0149 +0150 if ismember(koID,allKOs) +0151 addMe=true; +0152 koCounter=koCounter+1; +0153 else +0154 addMe=false; +0155 continue; +0156 end +0157 +0158 %Add reaction ID (always 6 characters) +0159 model.rxns{koCounter}=koID; +0160 +0161 model.rxnNames{koCounter}=''; +0162 %Will be overwritten if it exists +0163 end +0164 +0165 %To get here we must be in a KO that should be added +0166 if addMe==true +0167 %Add name +0168 if strcmp(tline(1:12),'DEFINITION ') +0169 model.rxnNames{koCounter}=tline(13:end); +0170 end +0171 +0172 %Check if it's time to start adding genes +0173 if strcmp(tline(1:12),'GENES ') +0174 addingGenes=true; +0175 end +0176 +0177 %Add genes +0178 if addingGenes==true +0179 %We are now adding genes for the current KO. All gene +0180 %rows start with 12 spaces. If this is not true it +0181 %means that there is additional annotation such as +0182 %AUTHORS. This is not common but it exists +0183 if strcmp(tline(1:12),' ') || strcmp(tline(1:12),'GENES ') +0184 geneLine=tline(13:end); +0185 +0186 %Check if the line is from a new organism of from +0187 %the same as before +0188 if strcmp(geneLine(4),':') +0189 %If organism id contains 3 letters; +0190 currentOrganism=geneLine(1:3); +0191 %Parse the string +0192 genes=regexp(geneLine(6:end),' ','split'); +0193 genes=strcat(currentOrganism,':',genes(:)); +0194 elseif strcmp(geneLine(5),':') +0195 %If organism id contains 4 letters; +0196 currentOrganism=geneLine(1:4); +0197 %Parse the string +0198 genes=regexp(geneLine(7:end),' ','split'); +0199 genes=strcat(currentOrganism,':',genes(:)); +0200 end +0201 +0202 %Add the genes to the gene list +0203 for i=1:numel(genes) +0204 geneString=genes{i}; +0205 if geneMap.isKey(geneString) +0206 index=geneMap(geneString); +0207 else +0208 nGenes=nGenes+1; +0209 model.genes{nGenes}=geneString; +0210 index=nGenes; +0211 geneMap(geneString)=index; +0212 end +0213 +0214 %Add the connection to the KO +0215 nMappings=nMappings+1; +0216 koIndex(nMappings)=koCounter; +0217 geneIndex(nMappings)=index; +0218 end +0219 else +0220 %Now we want to throw away everything until the +0221 %next entry +0222 while 1 +0223 tline = fgetl(fid); +0224 if strcmp(tline,'///') +0225 %When the new entry is found, skip reading +0226 %one line to fit with the rest of the code +0227 skipRead=true; +0228 break; +0229 end +0230 end +0231 end +0232 end +0233 end +0234 end +0235 %Close the file +0236 fclose(fid); +0237 +0238 %If too much space was allocated, shrink the model +0239 model.rxns=model.rxns(1:koCounter); +0240 model.rxnNames=model.rxnNames(1:koCounter); +0241 model.genes=model.genes(1:nGenes); +0242 +0243 %Retrieve and add the gene associations +0244 model.rxnGeneMat=sparse(koIndex(1:nMappings),geneIndex(1:nMappings),ones(nMappings,1)); +0245 +0246 %To make sure the size is correct if the last KOs don't have genes +0247 if size(model.rxnGeneMat,1)~=koCounter +0248 model.rxnGeneMat(koCounter,1)=0; +0249 end +0250 +0251 %Trim the genes so that they only contain information that can be +0252 %matched to the KEGG file of protein sequences (remove all +0253 %information after first parenthesis) +0254 %NOTE: For some reason the organism abbreviation should be in lower +0255 %case in this database. Fix this here +0256 for i=1:numel(model.genes) +0257 parIndex=strfind(model.genes{i},'('); +0258 if any(parIndex) +0259 model.genes{i}=model.genes{i}(1:parIndex-1); +0260 end +0261 colIndex=strfind(model.genes{i},':'); +0262 model.genes{i}=[lower(model.genes{i}(1:colIndex-1)) model.genes{i}(colIndex:end)]; +0263 end +0264 +0265 %Save the model structure +0266 save(genesFile,'model'); +0267 end +0268 end +0269 +0270 %Only get the KOs in koList +0271 if nargin>1 +0272 I=~ismember(model.rxns,koList); +0273 model=removeReactions(model,I,true,true); +0274 end +0275 fprintf('COMPLETE\n'); +0276 end 0277 -0278 %First check if the reactions have already been parsed -0279 ravenPath=findRAVENroot; -0280 rxnsFile=fullfile(ravenPath,'external','kegg','keggRxns.mat'); -0281 if exist(rxnsFile, 'file') -0282 fprintf(['NOTE: Importing KEGG ORTHOLOGY list from ' strrep(rxnsFile,'\','/') '.\n']); -0283 load(rxnsFile,'model'); -0284 -0285 %Loop through the reactions and add the corresponding genes -0286 for i=1:numel(model.rxns) -0287 if isstruct(model.rxnMiriams{i}) -0288 %Get all KOs -0289 allKOs=[allKOs;model.rxnMiriams{i}.value(strcmpi(model.rxnMiriams{i}.name,'kegg.orthology'))]; -0290 end -0291 end -0292 else -0293 %Parse the reaction file instead First load information on reaction ID, -0294 %reaction name, KO, pathway and ec-number -0295 fid = fopen(fullfile(keggPath,'reaction'), 'r'); -0296 orthology=false; -0297 while 1 -0298 %Get the next line -0299 tline = fgetl(fid); -0300 -0301 %Abort at end of file -0302 if ~ischar(tline) -0303 break; -0304 end -0305 -0306 %Skip '///' -0307 if numel(tline)<12 -0308 continue; -0309 end -0310 -0311 %Check if it's a new reaction -0312 if strcmp(tline(1:12),'ENTRY ') -0313 orthology=false; -0314 end -0315 -0316 if strcmp(tline(1:9),'REFERENCE') -0317 orthology=false; -0318 end -0319 -0320 %Add KO-ids -0321 if numel(tline)>16 -0322 if strcmp(tline(1:16),'ORTHOLOGY KO: ') || strcmp(tline(1:16),' KO: ') || strcmp(tline(1:12),'ORTHOLOGY ') || orthology==true -0323 if strcmp(tline(13:16),'KO:') -0324 %This is in the old version -0325 allKOs=[allKOs;tline(17:22)]; -0326 elseif strcmp(tline(1:12),'ORTHOLOGY ') -0327 %This means that it found one KO in the new format and -0328 %that subsequent lines might be other KOs -0329 orthology=true; -0330 allKOs=[allKOs;tline(13:18)]; -0331 end -0332 end -0333 end -0334 end -0335 -0336 %Close the file -0337 fclose(fid); -0338 end -0339 allKOs=unique(allKOs); -0340 end