001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on December 19, 2013 021 * Author: Douglas Myers-Turnbull 022 */ 023 024package org.biojava.nbio.structure; 025 026import java.io.IOException; 027import java.util.ArrayList; 028import java.util.Arrays; 029import java.util.LinkedList; 030import java.util.List; 031 032import org.biojava.nbio.structure.align.util.AtomCache; 033import org.biojava.nbio.structure.contact.Grid; 034import org.slf4j.Logger; 035import org.slf4j.LoggerFactory; 036 037/** 038 * This is the canonical way to identify a part of a structure. 039 * 040 * <p>The current syntax allows the specification of a set of residues from 041 * the first model of a structure. Future versions may be extended to represent 042 * additional properties. 043 * 044 * <p>Identifiers should adhere to the following specification, although some 045 * additional forms may be tolerated where unambiguous for backwards compatibility. 046 * <pre> 047 * name := pdbID 048 * | pdbID '.' chainID 049 * | pdbID '.' range 050 * range := range (',' range)? 051 * | chainID 052 * | chainID '_' resNum '-' resNum 053 * pdbID := [0-9][a-zA-Z0-9]{3} 054 * chainID := [a-zA-Z0-9]+ 055 * resNum := [-+]?[0-9]+[A-Za-z]? 056 * </pre> 057 * For example: 058 * <pre> 059 * 1TIM #whole structure 060 * 1tim #same as above 061 * 4HHB.C #single chain 062 * 3AA0.A,B #two chains 063 * 4GCR.A_1-40 #substructure 064 * 3iek.A_17-28,A_56-294,A_320-377 #substructure of 3 disjoint parts 065 * </pre> 066 * More options may be added to the specification at a future time. 067 068 * @author dmyersturnbull 069 * @author Spencer Bliven 070 */ 071public class SubstructureIdentifier implements StructureIdentifier { 072 073 private static final long serialVersionUID = 1L; 074 075 private static final Logger logger = LoggerFactory.getLogger(SubstructureIdentifier.class); 076 077 078 private final String pdbId; 079 private final List<ResidueRange> ranges; 080 081 /** 082 * Create a new identifier from a string. 083 * @param id 084 */ 085 public SubstructureIdentifier(String id) { 086 String[] idRange = id.split("\\."); 087 if(1 > idRange.length || idRange.length > 2 ) { 088 throw new IllegalArgumentException(String.format("Malformed %s: %s",getClass().getSimpleName(),id)); 089 } 090 if(idRange[0].length() != 4) { 091 this.pdbId = idRange[0]; 092 // Changed from Exception to a warning to support files and stuff -sbliven 2015/01/22 093 logger.warn(String.format("Unrecognized PDB code %s",this.pdbId)); 094 } else { 095 this.pdbId = idRange[0].toUpperCase(); 096 } 097 098 if( idRange.length == 2) { 099 String rangeStr = idRange[1].trim(); 100 101 this.ranges = ResidueRange.parseMultiple(rangeStr); 102 } else { 103 this.ranges = new LinkedList<ResidueRange>(); 104 } 105 } 106 107 /** 108 * Create a new identifier based on a set of ranges. 109 * 110 * If ranges is empty, includes all residues. 111 * @param pdbId 112 * @param ranges 113 */ 114 public SubstructureIdentifier(String pdbId, List<ResidueRange> ranges) { 115 if(ranges == null) { 116 throw new NullPointerException("Null ranges list"); 117 } 118 this.pdbId = pdbId; 119 this.ranges = ranges; 120 } 121 122 @Override 123 public String toString() { 124 return getIdentifier(); 125 } 126 127 /** 128 * Get the String form of this identifier. 129 * 130 * This provides the canonical form for a StructureIdentifier and has 131 * all the information needed to recreate a particular substructure. 132 * 133 * Example: 3iek.A_17-28,A_56-294 134 * @return The String form of this identifier 135 */ 136 @Override 137 public String getIdentifier() { 138 if (ranges.isEmpty()) return pdbId; 139 return pdbId + "." + ResidueRange.toString(ranges); 140 } 141 142 public String getPdbId() { 143 return pdbId; 144 } 145 146 public List<ResidueRange> getResidueRanges() { 147 return ranges; 148 } 149 150 /** 151 * Return itself. SubstructureIdentifiers are canonical! 152 */ 153 @Override 154 public SubstructureIdentifier toCanonical() { 155 return this; 156 } 157 158 /** 159 * Takes a complete structure as input and reduces it to residues present in 160 * the specified ranges 161 * 162 * <p>The returned structure will be a shallow copy of the input, with shared 163 * Chains, Residues, etc. 164 * 165 * <p>Ligands are handled in a special way. If a full chain is selected 166 * (e.g. '1ABC.A') then any waters and ligands with matching chain name are 167 * included. If a residue range is present ('1ABC.A:1-100') then any 168 * ligands (technically non-water non-polymer atoms) within 169 * {@link StructureTools#DEFAULT_LIGAND_PROXIMITY_CUTOFF} of the selected 170 * range are included, regardless of chain. 171 * @param input A full structure, e.g. as loaded from the PDB. The structure 172 * ID should match that returned by getPdbId(). 173 * @return 174 * @throws StructureException 175 * @see StructureTools#getReducedStructure(Structure, String) 176 */ 177 @Override 178 public Structure reduce(Structure s) throws StructureException { 179 // Follows StructureImpl.clone() 180 181 // Create new structure & copy basic properties 182 Structure newS = new StructureImpl(); 183 184 newS.setPDBCode(s.getPDBCode()); 185 newS.setPDBHeader(s.getPDBHeader()); 186 newS.setName(this.toString()); 187 newS.setDBRefs(s.getDBRefs()); 188 newS.setBiologicalAssembly(s.isBiologicalAssembly()); 189 newS.getPDBHeader().setDescription( 190 "sub-range " + ranges + " of " + newS.getPDBCode() + " " 191 + s.getPDBHeader().getDescription()); 192 newS.setEntityInfos(new ArrayList<>()); 193 // TODO The following should be only copied for atoms which are present in the range. 194 newS.setSSBonds(s.getSSBonds()); 195 newS.setSites(s.getSites()); 196 197 newS.setStructureIdentifier(this); 198 199 for( int modelNr=0;modelNr<s.nrModels();modelNr++) { 200 201 // Construct new model 202 newS.addModel(new ArrayList<Chain>()); 203 204 if(getResidueRanges().isEmpty()) { 205 // Include all residues 206 newS.setEntityInfos(s.getEntityInfos()); 207 newS.setSSBonds(s.getSSBonds()); 208 newS.setSites(s.getSites()); 209 210 newS.setModel(modelNr, s.getModel(modelNr)); 211 } else { 212 // Restrict residues 213 for( ResidueRange range: getResidueRanges()) { 214 215 String chainName = range.getChainName(); 216 ResidueNumber pdbresnum1 = range.getStart(); 217 ResidueNumber pdbresnum2 = range.getEnd(); 218 219// StructureTools.addGroupsToStructure(newS, groups, modelNr, false); 220 Chain polyChain; //polymer 221 if(chainName.equals("_") ) { 222 // Handle special case of "_" chain for single-chain proteins 223 polyChain = s.getPolyChains(modelNr).get(0); 224 chainName = polyChain.getName(); 225 if(pdbresnum1 != null) 226 pdbresnum1.setChainName(chainName); 227 if(pdbresnum2 != null) 228 pdbresnum2.setChainName(chainName); 229 230 if(s.getPolyChains().size() != 1) { 231 // SCOP 1.71 uses this for some proteins with multiple chains 232 // Print a warning in this ambiguous case 233 logger.warn("Multiple possible chains match '_'. Using chain {}",chainName); 234 } 235 } else { 236 // Explicit chain 237 polyChain = s.getPolyChainByPDB(chainName,modelNr); 238 if( polyChain == null ) { 239 // Chain not found 240 // Maybe it was a chain index, masquerading as a chainName? 241 try { 242 int chainNum = Integer.parseInt(chainName); 243 polyChain = s.getChainByIndex(modelNr, chainNum); 244 chainName = polyChain.getName(); 245 if(pdbresnum1 != null) 246 pdbresnum1.setChainName(chainName); 247 if(pdbresnum2 != null) 248 pdbresnum2.setChainName(chainName); 249 logger.warn("No chain found for {}. Interpretting it as an index, using chain {} instead",chainName,polyChain.getId()); 250 } catch(NumberFormatException e3) { 251 // Not an index. Throw the original exception 252 throw new StructureException(String.format("Unrecognized chain %s in %s",chainName,getIdentifier())); 253 } 254 } 255 } 256 257 if(pdbresnum1 == null && pdbresnum2 == null) { 258 // Include all atoms with matching chainName 259 StructureTools.addGroupsToStructure(newS, polyChain.getAtomGroups(), modelNr, false); 260 for(Chain chain : s.getNonPolyChainsByPDB(chainName, modelNr) ) { 261 StructureTools.addGroupsToStructure(newS, chain.getAtomGroups(), modelNr, false); 262 } 263 Chain waters = s.getWaterChainByPDB(chainName, modelNr); 264 if( waters != null) { 265 StructureTools.addGroupsToStructure(newS, waters.getAtomGroups(), modelNr, false); 266 } 267 268 // TODO do we need to prune SeqRes down to the atoms present? -SB 2016-10-7 269 } else { 270 // Include polymer range and any proximal ligands 271 List<Group> polygroups = Arrays.asList(polyChain.getGroupsByPDB(pdbresnum1, pdbresnum2)); 272 StructureTools.addGroupsToStructure(newS, polygroups, modelNr, false); 273 copyLigandsByProximity(s,newS, StructureTools.DEFAULT_LIGAND_PROXIMITY_CUTOFF, modelNr, modelNr); 274 } 275 } // end range 276 } 277 } // end modelNr 278 279 return newS; 280 } 281 282 /** 283 * Loads the complete structure based on {@link #getPdbId()}. 284 * 285 * @param AtomCache A source of structures 286 * @return A Structure containing at least the atoms identified by this, 287 * or null if no PDB ID is set 288 * @throws StructureException For errors loading and parsing the structure 289 * @throws IOException Errors reading the structure from disk 290 */ 291 @Override 292 public Structure loadStructure(AtomCache cache) throws IOException, StructureException { 293 String pdb = getPdbId(); 294 if(pdb == null) 295 return null; 296 return cache.getStructureForPdbId(pdb); 297 } 298 299 /** 300 * Supplements the reduced structure with ligands from the full structure based on 301 * a distance cutoff. Ligand groups are moved (destructively) from full to reduced 302 * if they fall within the cutoff of any atom in the reduced structure. 303 * The {@link StructureTools#DEFAULT_LIGAND_PROXIMITY_CUTOFF default cutoff} 304 * is used. 305 * @param full Structure containing all ligands 306 * @param reduced Structure with a subset of the polymer groups from full 307 * @see StructureTools#getLigandsByProximity(java.util.Collection, Atom[], double) 308 */ 309 protected static void copyLigandsByProximity(Structure full, Structure reduced) { 310 // Normal case where all models should be copied from full to reduced 311 assert full.nrModels() >= reduced.nrModels(); 312 for(int model = 0; model< reduced.nrModels(); model++) { 313 copyLigandsByProximity(full, reduced, StructureTools.DEFAULT_LIGAND_PROXIMITY_CUTOFF, model, model); 314 } 315 } 316 /** 317 * Supplements the reduced structure with ligands from the full structure based on 318 * a distance cutoff. Ligand groups are moved (destructively) from full to reduced 319 * if they fall within the cutoff of any atom in the reduced structure. 320 * @param full Structure containing all ligands 321 * @param reduced Structure with a subset of the polymer groups from full 322 * @param cutoff Distance cutoff (Å) 323 * @param fromModel source model in full 324 * @param toModel destination model in reduced 325 * @see StructureTools#getLigandsByProximity(java.util.Collection, Atom[], double) 326 */ 327 protected static void copyLigandsByProximity(Structure full, Structure reduced, double cutoff, int fromModel, int toModel) { 328 // Geometric hashing of the reduced structure 329 Grid grid = new Grid(cutoff); 330 Atom[] nonwaters = StructureTools.getAllNonHAtomArray(reduced,true,toModel); 331 if( nonwaters.length < 1 ) 332 return; 333 grid.addAtoms(nonwaters); 334 335 full.getNonPolyChains(fromModel).stream() //potential ligand chains 336 .flatMap((chain) -> chain.getAtomGroups().stream() ) // potential ligand groups 337 .filter( (g) -> !g.isWater() ) // ignore waters 338 .filter( (g) -> !g.isPolymeric() ) // already shouldn't be polymeric, but filter anyways 339 .filter( (g) -> grid.hasAnyContact(Calc.atomsToPoints(g.getAtoms())) ) // must contact reduced 340 .sequential() // Keeps ligands from the same chain together if possible 341 .reduce((Chain)null, // reduction updates the chain guess 342 (guess, g ) -> { 343 boolean wasAdded; 344 try { 345 // Check that it's not in reduced already 346 wasAdded = reduced.findGroup(g.getChainId(), 347 g.getResidueNumber().toString(), toModel) != null; 348 } catch (StructureException e) { 349 // not found 350 wasAdded = false; 351 } 352 if( !wasAdded ) { 353 // Add the ligand to reduced 354 // note this is not idempotent, but it is synchronized on reduced 355 logger.info("Adding ligand group {} {} by proximity",g.getPDBName(), g.getResidueNumber().toPDB()); 356 return StructureTools.addGroupToStructure(reduced, g, toModel, guess, false); 357 } 358 return guess; 359 }, 360 // update to the new guess 361 (oldGuess, newGuess) -> newGuess ); 362 } 363 364}