001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on December 19, 2013 021 * Author: Douglas Myers-Turnbull 022 */ 023 024package org.biojava.nbio.structure; 025 026import java.io.IOException; 027import java.io.Serializable; 028import java.util.ArrayList; 029import java.util.Arrays; 030import java.util.LinkedList; 031import java.util.List; 032 033import org.biojava.nbio.structure.align.util.AtomCache; 034import org.slf4j.Logger; 035import org.slf4j.LoggerFactory; 036 037/** 038 * This is the canonical way to identify a part of a structure. 039 * 040 * <p>The current syntax allows the specification of a set of residues from 041 * the first model of a structure. Future versions may be extended to represent 042 * additional properties. 043 * 044 * <p>Identifiers should adhere to the following specification, although some 045 * additional forms may be tolerated where unambiguous for backwards compatibility. 046 * <pre> 047 * name := pdbID 048 * | pdbID '.' chainID 049 * | pdbID '.' range 050 * range := range (',' range)? 051 * | chainID 052 * | chainID '_' resNum '-' resNum 053 * pdbID := [0-9][a-zA-Z0-9]{3} 054 * chainID := [a-zA-Z0-9]+ 055 * resNum := [-+]?[0-9]+[A-Za-z]? 056 * </pre> 057 * For example: 058 * <pre> 059 * 1TIM #whole structure 060 * 1tim #same as above 061 * 4HHB.C #single chain 062 * 3AA0.A,B #two chains 063 * 4GCR.A_1-40 #substructure 064 * 3iek.A_17-28,A_56-294,A_320-377 #substructure of 3 disjoint parts 065 * </pre> 066 * More options may be added to the specification at a future time. 067 068 * @author dmyersturnbull 069 * @author Spencer Bliven 070 */ 071public class SubstructureIdentifier implements Serializable, StructureIdentifier { 072 073 private static final long serialVersionUID = 1L; 074 075 private static final Logger logger = LoggerFactory.getLogger(SubstructureIdentifier.class); 076 077 private final String pdbId; 078 private final List<ResidueRange> ranges; 079 080 /** 081 * Create a new identifier from a string. 082 * @param id 083 */ 084 public SubstructureIdentifier(String id) { 085 String[] idRange = id.split("\\."); 086 if(1 > idRange.length || idRange.length > 2 ) { 087 throw new IllegalArgumentException(String.format("Malformed %s: %s",getClass().getSimpleName(),id)); 088 } 089 if(idRange[0].length() != 4) { 090 this.pdbId = idRange[0]; 091 // Changed from Exception to a warning to support files and stuff -sbliven 2015/01/22 092 logger.warn(String.format("Unrecognized PDB code %s",this.pdbId)); 093 } else { 094 this.pdbId = idRange[0].toUpperCase(); 095 } 096 097 if( idRange.length == 2) { 098 String rangeStr = idRange[1].trim(); 099 100 this.ranges = ResidueRange.parseMultiple(rangeStr); 101 } else { 102 this.ranges = new LinkedList<ResidueRange>(); 103 } 104 } 105 106 /** 107 * Create a new identifier based on a set of ranges. 108 * 109 * If ranges is empty, includes all residues. 110 * @param pdbId 111 * @param ranges 112 */ 113 public SubstructureIdentifier(String pdbId, List<ResidueRange> ranges) { 114 if(ranges == null) { 115 throw new NullPointerException("Null ranges list"); 116 } 117 this.pdbId = pdbId; 118 this.ranges = ranges; 119 } 120 121 @Override 122 public String toString() { 123 return getIdentifier(); 124 } 125 126 /** 127 * Get the String form of this identifier. 128 * 129 * This provides the canonical form for a StructureIdentifier and has 130 * all the information needed to recreate a particular substructure. 131 * 132 * Example: 3iek.A_17-28,A_56-294 133 * @return The String form of this identifier 134 */ 135 @Override 136 public String getIdentifier() { 137 if (ranges.isEmpty()) return pdbId; 138 return pdbId + "." + ResidueRange.toString(ranges); 139 } 140 141 public String getPdbId() { 142 return pdbId; 143 } 144 145 public List<ResidueRange> getResidueRanges() { 146 return ranges; 147 } 148 149 /** 150 * Return itself. SubstructureIdentifiers are canonical! 151 */ 152 @Override 153 public SubstructureIdentifier toCanonical() { 154 return this; 155 } 156 157 /** 158 * Takes a complete structure as input and reduces it to residues present in 159 * the specified ranges 160 * 161 * <p>The returned structure will be a shallow copy of the input, with shared 162 * Chains, Residues, etc. 163 * @param input A full structure, e.g. as loaded from the PDB. The structure 164 * ID should match that returned by getPdbId(). 165 * @return 166 * @throws StructureException 167 * @see StructureTools#getReducedStructure(Structure, String) 168 */ 169 @Override 170 public Structure reduce(Structure s) throws StructureException { 171 // Follows StructureImpl.clone() 172 173 // Create new structure & copy basic properties 174 Structure newS = new StructureImpl(); 175 176 newS.setPDBCode(s.getPDBCode()); 177 newS.setPDBHeader(s.getPDBHeader()); 178 newS.setName(this.toString()); 179 newS.setDBRefs(s.getDBRefs()); 180 newS.setBiologicalAssembly(s.isBiologicalAssembly()); 181 newS.getPDBHeader().setDescription( 182 "sub-range " + ranges + " of " + newS.getPDBCode() + " " 183 + s.getPDBHeader().getDescription()); 184 // TODO The following should be only copied for atoms which are present in the range. 185 newS.setCompounds(s.getCompounds()); 186 187 newS.setSSBonds(s.getSSBonds()); 188 newS.setSites(s.getSites()); 189 190 newS.setStructureIdentifier(this); 191 192 for( int modelNr=0;modelNr<s.nrModels();modelNr++) { 193 String prevChainId = null; 194 195 196 // Construct new model 197 newS.addModel(new ArrayList<Chain>()); 198 199 if(getResidueRanges().isEmpty()) { 200 // Include all residues 201 newS.setCompounds(s.getCompounds()); 202 newS.setSSBonds(s.getSSBonds()); 203 newS.setSites(s.getSites()); 204 205 newS.setModel(modelNr, s.getModel(modelNr)); 206 } else { 207 // Restrict residues 208 for( ResidueRange range: getResidueRanges()) { 209 210 String chainId = range.getChainId(); 211 ResidueNumber pdbresnum1 = range.getStart(); 212 ResidueNumber pdbresnum2 = range.getEnd(); 213 214 Chain chain; 215 if(chainId.equals("_") ) { 216 // Handle special case of "_" chain for single-chain proteins 217 chain = s.getChain(modelNr,0); 218 if(pdbresnum1 != null) 219 pdbresnum1.setChainId(chain.getChainID()); 220 if(pdbresnum2 != null) 221 pdbresnum2.setChainId(chain.getChainID()); 222 223 if(s.size() != 1) { 224 // SCOP 1.71 uses this for some proteins with multiple chains 225 // Print a warning in this ambiguous case 226 logger.warn("Multiple possible chains match '_'. Using chain {}",chain.getChainID()); 227 } 228 } else { 229 // Explicit chain 230 try { 231 chain = s.getChainByPDB(chainId,modelNr); 232 } catch(StructureException e) { 233 // Chain not found 234 // Maybe it was a chain index, masquerading as a chainId? 235 try { 236 int chainNum = Integer.parseInt(chainId); 237 try { 238 chain = s.getChain(modelNr, chainNum); 239 logger.warn("No chain found for {}. Interpretting it as an index, using chain {} instead",chainId,chain.getChainID()); 240 } catch(Exception e2) { //we don't care what gets thrown here -sbliven 241 throw e; // Nope, not an index. Throw the original exception 242 } 243 } catch(NumberFormatException e3) { 244 // Not an index. Throw the original exception 245 throw e; 246 } 247 } 248 } 249 250 List<Group> groups; 251 if(pdbresnum1 == null && pdbresnum2 == null) { 252 groups = chain.getAtomGroups(); 253 } else { 254// // Trim extra residues off the range 255// Atom[] allAtoms = StructureTools.getRepresentativeAtomArray(chain); 256// AtomPositionMap map = new AtomPositionMap(allAtoms); 257// ResidueRange trimmed = map.trimToValidResidues( 258// new ResidueRange(chain.getChainID(), 259// pdbresnum1, pdbresnum2)); 260// if (trimmed != null) { 261// pdbresnum1 = trimmed.getStart(); 262// pdbresnum2 = trimmed.getEnd(); 263// } 264 groups = Arrays.asList(chain.getGroupsByPDB(pdbresnum1, pdbresnum2)); 265 } 266 267 Chain c = null; 268 269 // Reuse prevChain 270 if ( prevChainId != null && prevChainId.equals(chain.getChainID())) { 271 c = newS.getChainByPDB(prevChainId,modelNr); 272 } else { 273 try { 274 c = newS.getChainByPDB(chain.getChainID(),modelNr); 275 } catch (StructureException e){ 276 // chain not in structure yet... 277 } 278 } 279 // Create new chain 280 if ( c == null) { 281 // first chain... 282 c = new ChainImpl(); 283 c.setChainID(chain.getChainID()); 284 newS.addChain(c,modelNr); 285 c.setSeqResGroups(chain.getSeqResGroups()); 286 c.setSeqMisMatches(chain.getSeqMisMatches()); 287 } 288 289 // add the groups to the chain: 290 for ( Group g: groups) { 291 c.addGroup(g); 292 } 293 294 prevChainId = c.getChainID(); 295 } // end range 296 } 297 } // end modelNr 298 299 return newS; 300 } 301 302 /** 303 * Loads the complete structure based on {@link #getPdbId()}. 304 * 305 * @param AtomCache A source of structures 306 * @return A Structure containing at least the atoms identified by this, 307 * or null if no PDB ID is set 308 * @throws StructureException For errors loading and parsing the structure 309 * @throws IOException Errors reading the structure from disk 310 */ 311 @Override 312 public Structure loadStructure(AtomCache cache) throws IOException, StructureException { 313 String pdb = getPdbId(); 314 if(pdb == null) 315 return null; 316 return cache.getStructureForPdbId(pdb); 317 } 318 319}