001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on December 19, 2013 021 * Author: Douglas Myers-Turnbull 022 */ 023 024package org.biojava.nbio.structure; 025 026import java.io.IOException; 027import java.util.ArrayList; 028import java.util.Arrays; 029import java.util.LinkedList; 030import java.util.List; 031 032import org.biojava.nbio.structure.align.util.AtomCache; 033import org.biojava.nbio.structure.contact.Grid; 034import org.slf4j.Logger; 035import org.slf4j.LoggerFactory; 036/** 037 * This is the canonical way to identify a part of a structure. 038 * 039 * <p>The current syntax allows the specification of a set of residues from 040 * the first model of a structure. Future versions may be extended to represent 041 * additional properties. 042 * 043 * <p>Identifiers should adhere to the following specification, although some 044 * additional forms may be tolerated where unambiguous for backwards compatibility. 045 * <pre> 046 * name := pdbID 047 * | pdbID '.' chainID 048 * | pdbID '.' range 049 * range := range (',' range)? 050 * | chainID 051 * | chainID '_' resNum '-' resNum 052 * pdbID := [1-9][a-zA-Z0-9]{3} 053 * | PDB_[a-zA-Z0-9]{8} 054 * chainID := [a-zA-Z0-9]+ 055 * resNum := [-+]?[0-9]+[A-Za-z]? 056 * </pre> 057 * For example: 058 * <pre> 059 * 1TIM #whole structure (short format) 060 * 1tim #same as above 061 * 4HHB.C #single chain 062 * 3AA0.A,B #two chains 063 * 4GCR.A_1-40 #substructure 064 * 3iek.A_17-28,A_56-294,A_320-377 #substructure of 3 disjoint parts 065 * PDB_00001TIM #whole structure (extended format) 066 * pdb_00001tim #same as above 067 * PDB_00004HHB.C #single chain 068 * PDB_00003AA0.A,B #two chains 069 * PDB_00004GCR.A_1-40 #substructure 070 * pdb_00003iek.A_17-28,A_56-294,A_320-377 #substructure of 3 disjoint parts 071 * </pre> 072 * More options may be added to the specification at a future time. 073 074 * @author dmyersturnbull 075 * @author Spencer Bliven 076 */ 077public class SubstructureIdentifier implements StructureIdentifier { 078 079 private static final long serialVersionUID = 1L; 080 081 private static final Logger logger = LoggerFactory.getLogger(SubstructureIdentifier.class); 082 083 084 private final PdbId pdbId; 085 private final List<ResidueRange> ranges; 086 087 /** 088 * Create a new identifier from a string. 089 * @param id 090 */ 091 public SubstructureIdentifier(String id) { 092 String[] idRange = id.split("\\."); 093 if(1 > idRange.length || idRange.length > 2 ) { 094 throw new IllegalArgumentException(String.format("Malformed %s: %s",getClass().getSimpleName(),id)); 095 } 096 //used tempId to avoid writing 2 assignment statements to a final field, 097 // although one is in the try block and the other in the catch block. 098 PdbId tempId = null; 099 try { 100 tempId = new PdbId(idRange[0]); 101 } catch (IllegalArgumentException e) { 102 // Changed from Exception to a warning to support files and stuff -sbliven 2015/01/22 103 logger.warn(String.format("Unrecognized PDB code %s", idRange[0])); 104 } 105 this.pdbId = tempId; 106 107 if( idRange.length == 2) { 108 String rangeStr = idRange[1].trim(); 109 110 this.ranges = ResidueRange.parseMultiple(rangeStr); 111 } else { 112 this.ranges = new LinkedList<ResidueRange>(); 113 } 114 } 115 116 /** 117 * Create a new identifier based on a set of ranges. 118 * 119 * If ranges is empty, includes all residues. 120 * @param pdbId 121 * @param ranges 122 */ 123 public SubstructureIdentifier(String pdbId, List<ResidueRange> ranges) { 124 this(new PdbId(pdbId), ranges); 125 } 126 127 /** 128 * Create a new identifier based on a set of ranges. 129 * 130 * If ranges is empty, includes all residues. 131 * @param pdbId 132 * @param ranges 133 */ 134 public SubstructureIdentifier(PdbId pdbId, List<ResidueRange> ranges) { 135 if(ranges == null) { 136 throw new NullPointerException("Null ranges list"); 137 } 138 this.pdbId = pdbId; 139 this.ranges = ranges; 140 } 141 142 @Override 143 public String toString() { 144 return getIdentifier(); 145 } 146 147 /** 148 * Get the String form of this identifier. 149 * 150 * This provides the canonical form for a StructureIdentifier and has 151 * all the information needed to recreate a particular substructure. 152 * 153 * Example: 3iek.A_17-28,A_56-294 154 * @return The String form of this identifier 155 */ 156 @Override 157 public String getIdentifier() { 158 String pdbId = this.pdbId == null? "": this.pdbId.getId(); 159 if (ranges.isEmpty()) return pdbId; 160 return pdbId + "." + ResidueRange.toString(ranges); 161 } 162 163 /** 164 * Get the PDB identifier part of the SubstructureIdentifier 165 * @return the PDB ID 166 */ 167 public PdbId getPdbId() { 168 return pdbId; 169 } 170 171 public List<ResidueRange> getResidueRanges() { 172 return ranges; 173 } 174 175 /** 176 * Return itself. SubstructureIdentifiers are canonical! 177 */ 178 @Override 179 public SubstructureIdentifier toCanonical() { 180 return this; 181 } 182 183 /** 184 * Takes a complete structure as input and reduces it to residues present in 185 * the specified ranges 186 * 187 * <p>The returned structure will be a shallow copy of the input, with shared 188 * Chains, Residues, etc. 189 * 190 * <p>Ligands are handled in a special way. If a full chain is selected 191 * (e.g. '1ABC.A') then any waters and ligands with matching chain name are 192 * included. If a residue range is present ('1ABC.A:1-100') then any 193 * ligands (technically non-water non-polymer atoms) within 194 * {@link StructureTools#DEFAULT_LIGAND_PROXIMITY_CUTOFF} of the selected 195 * range are included, regardless of chain. 196 * @param input A full structure, e.g. as loaded from the PDB. The structure 197 * ID should match that returned by getPdbId(). 198 * @return 199 * @throws StructureException 200 * @see StructureTools#getReducedStructure(Structure, String) 201 */ 202 @Override 203 public Structure reduce(Structure s) throws StructureException { 204 // Follows StructureImpl.clone() 205 206 if(s == null) 207 throw new StructureException("NullPointerException Possibly due to malformed PIBId format."); 208 209 // Create new structure & copy basic properties 210 Structure newS = new StructureImpl(); 211 212 newS.setPdbId(s.getPdbId()); 213 newS.setPDBHeader(s.getPDBHeader()); 214 newS.setName(this.toString()); 215 newS.setDBRefs(s.getDBRefs()); 216 newS.setBiologicalAssembly(s.isBiologicalAssembly()); 217 newS.getPDBHeader().setDescription( 218 "sub-range " + ranges + " of " + newS.getPdbId() + " " 219 + s.getPDBHeader().getDescription()); 220 newS.setEntityInfos(new ArrayList<>()); 221 // TODO The following should be only copied for atoms which are present in the range. 222 newS.setSSBonds(s.getSSBonds()); 223 newS.setSites(s.getSites()); 224 225 newS.setStructureIdentifier(this); 226 227 for( int modelNr=0;modelNr<s.nrModels();modelNr++) { 228 229 // Construct new model 230 newS.addModel(new ArrayList<Chain>()); 231 232 if(getResidueRanges().isEmpty()) { 233 // Include all residues 234 newS.setEntityInfos(s.getEntityInfos()); 235 newS.setSSBonds(s.getSSBonds()); 236 newS.setSites(s.getSites()); 237 238 newS.setModel(modelNr, s.getModel(modelNr)); 239 } else { 240 // Restrict residues 241 for( ResidueRange range: getResidueRanges()) { 242 243 String chainName = range.getChainName(); 244 ResidueNumber pdbresnum1 = range.getStart(); 245 ResidueNumber pdbresnum2 = range.getEnd(); 246 247// StructureTools.addGroupsToStructure(newS, groups, modelNr, false); 248 Chain polyChain; //polymer 249 if(chainName.equals("_") ) { 250 // Handle special case of "_" chain for single-chain proteins 251 polyChain = s.getPolyChains(modelNr).get(0); 252 chainName = polyChain.getName(); 253 if(pdbresnum1 != null) 254 pdbresnum1.setChainName(chainName); 255 if(pdbresnum2 != null) 256 pdbresnum2.setChainName(chainName); 257 258 if(s.getPolyChains().size() != 1) { 259 // SCOP 1.71 uses this for some proteins with multiple chains 260 // Print a warning in this ambiguous case 261 logger.warn("Multiple possible chains match '_'. Using chain {}",chainName); 262 } 263 } else { 264 // Explicit chain 265 polyChain = s.getPolyChainByPDB(chainName,modelNr); 266 if( polyChain == null ) { 267 // Chain not found 268 // Maybe it was a chain index, masquerading as a chainName? 269 try { 270 int chainNum = Integer.parseInt(chainName); 271 polyChain = s.getChainByIndex(modelNr, chainNum); 272 chainName = polyChain.getName(); 273 if(pdbresnum1 != null) 274 pdbresnum1.setChainName(chainName); 275 if(pdbresnum2 != null) 276 pdbresnum2.setChainName(chainName); 277 logger.warn("No chain found for {}. Interpretting it as an index, using chain {} instead",chainName,polyChain.getId()); 278 } catch(NumberFormatException e3) { 279 // Not an index. Throw the original exception 280 throw new StructureException(String.format("Unrecognized chain %s in %s",chainName,getIdentifier())); 281 } 282 } 283 } 284 285 if(pdbresnum1 == null && pdbresnum2 == null) { 286 // Include all atoms with matching chainName 287 StructureTools.addGroupsToStructure(newS, polyChain.getAtomGroups(), modelNr, false); 288 for(Chain chain : s.getNonPolyChainsByPDB(chainName, modelNr) ) { 289 StructureTools.addGroupsToStructure(newS, chain.getAtomGroups(), modelNr, false); 290 } 291 Chain waters = s.getWaterChainByPDB(chainName, modelNr); 292 if( waters != null) { 293 StructureTools.addGroupsToStructure(newS, waters.getAtomGroups(), modelNr, false); 294 } 295 296 // TODO do we need to prune SeqRes down to the atoms present? -SB 2016-10-7 297 } else { 298 // Include polymer range and any proximal ligands 299 List<Group> polygroups = Arrays.asList(polyChain.getGroupsByPDB(pdbresnum1, pdbresnum2)); 300 StructureTools.addGroupsToStructure(newS, polygroups, modelNr, false); 301 copyLigandsByProximity(s,newS, StructureTools.DEFAULT_LIGAND_PROXIMITY_CUTOFF, modelNr, modelNr); 302 } 303 } // end range 304 } 305 } // end modelNr 306 307 return newS; 308 } 309 310 /** 311 * Loads the complete structure based on {@link #getPdbId()}. 312 * 313 * @param AtomCache A source of structures 314 * @return A Structure containing at least the atoms identified by this, 315 * or null if no PDB ID is set 316 * @throws StructureException For errors loading and parsing the structure 317 * @throws IOException Errors reading the structure from disk 318 */ 319 @Override 320 public Structure loadStructure(AtomCache cache) throws IOException, StructureException { 321 PdbId pdb = getPdbId(); 322 if(pdb == null) 323 return null; 324 return cache.getStructureForPdbId(pdb); 325 } 326 327 /** 328 * Supplements the reduced structure with ligands from the full structure based on 329 * a distance cutoff. Ligand groups are moved (destructively) from full to reduced 330 * if they fall within the cutoff of any atom in the reduced structure. 331 * The {@link StructureTools#DEFAULT_LIGAND_PROXIMITY_CUTOFF default cutoff} 332 * is used. 333 * @param full Structure containing all ligands 334 * @param reduced Structure with a subset of the polymer groups from full 335 * @see StructureTools#getLigandsByProximity(java.util.Collection, Atom[], double) 336 */ 337 protected static void copyLigandsByProximity(Structure full, Structure reduced) { 338 // Normal case where all models should be copied from full to reduced 339 assert full.nrModels() >= reduced.nrModels(); 340 for(int model = 0; model< reduced.nrModels(); model++) { 341 copyLigandsByProximity(full, reduced, StructureTools.DEFAULT_LIGAND_PROXIMITY_CUTOFF, model, model); 342 } 343 } 344 /** 345 * Supplements the reduced structure with ligands from the full structure based on 346 * a distance cutoff. Ligand groups are moved (destructively) from full to reduced 347 * if they fall within the cutoff of any atom in the reduced structure. 348 * @param full Structure containing all ligands 349 * @param reduced Structure with a subset of the polymer groups from full 350 * @param cutoff Distance cutoff (Å) 351 * @param fromModel source model in full 352 * @param toModel destination model in reduced 353 * @see StructureTools#getLigandsByProximity(java.util.Collection, Atom[], double) 354 */ 355 protected static void copyLigandsByProximity(Structure full, Structure reduced, double cutoff, int fromModel, int toModel) { 356 // Geometric hashing of the reduced structure 357 Grid grid = new Grid(cutoff); 358 Atom[] nonwaters = StructureTools.getAllNonHAtomArray(reduced,true,toModel); 359 if( nonwaters.length < 1 ) 360 return; 361 grid.addAtoms(nonwaters); 362 363 full.getNonPolyChains(fromModel).stream() //potential ligand chains 364 .flatMap((chain) -> chain.getAtomGroups().stream() ) // potential ligand groups 365 .filter( (g) -> !g.isWater() ) // ignore waters 366 .filter( (g) -> !g.isPolymeric() ) // already shouldn't be polymeric, but filter anyways 367 .filter( (g) -> grid.hasAnyContact(Calc.atomsToPoints(g.getAtoms())) ) // must contact reduced 368 .sequential() // Keeps ligands from the same chain together if possible 369 .reduce((Chain)null, // reduction updates the chain guess 370 (guess, g ) -> { 371 boolean wasAdded; 372 try { 373 // Check that it's not in reduced already 374 wasAdded = reduced.findGroup(g.getChainId(), 375 g.getResidueNumber().toString(), toModel) != null; 376 } catch (StructureException e) { 377 // not found 378 wasAdded = false; 379 } 380 if( !wasAdded ) { 381 // Add the ligand to reduced 382 // note this is not idempotent, but it is synchronized on reduced 383 logger.info("Adding ligand group {} {} by proximity",g.getPDBName(), g.getResidueNumber().toPDB()); 384 return StructureTools.addGroupToStructure(reduced, g, toModel, guess, false); 385 } 386 return guess; 387 }, 388 // update to the new guess 389 (oldGuess, newGuess) -> newGuess ); 390 } 391 392}