001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on December 19, 2013
021 * Author: Douglas Myers-Turnbull
022 */
023
024package org.biojava.nbio.structure;
025
026import java.io.IOException;
027import java.util.ArrayList;
028import java.util.Arrays;
029import java.util.LinkedList;
030import java.util.List;
031
032import org.biojava.nbio.structure.align.util.AtomCache;
033import org.biojava.nbio.structure.contact.Grid;
034import org.slf4j.Logger;
035import org.slf4j.LoggerFactory;
036/**
037 * This is the canonical way to identify a part of a structure.
038 *
039 * <p>The current syntax allows the specification of a set of residues from
040 * the first model of a structure. Future versions may be extended to represent
041 * additional properties.
042 *
043 * <p>Identifiers should adhere to the following specification, although some
044 * additional forms may be tolerated where unambiguous for backwards compatibility.
045 * <pre>
046 *              name          := pdbID
047 *                             | pdbID '.' chainID
048 *                             | pdbID '.' range
049 *              range         := range (',' range)?
050 *                             | chainID
051 *                             | chainID '_' resNum '-' resNum
052 *              pdbID         := [1-9][a-zA-Z0-9]{3}
053 *                             | PDB_[a-zA-Z0-9]{8}
054 *              chainID       := [a-zA-Z0-9]+
055 *              resNum        := [-+]?[0-9]+[A-Za-z]?
056 * </pre>
057 * For example:
058 * <pre>
059 *              1TIM                                    #whole structure (short format)
060 *              1tim                                    #same as above
061 *              4HHB.C                                  #single chain
062 *              3AA0.A,B                                #two chains
063 *              4GCR.A_1-40                             #substructure
064 *      3iek.A_17-28,A_56-294,A_320-377         #substructure of 3 disjoint parts
065 *              PDB_00001TIM                            #whole structure (extended format)
066 *              pdb_00001tim                            #same as above
067 *              PDB_00004HHB.C                          #single chain
068 *              PDB_00003AA0.A,B                        #two chains
069 *              PDB_00004GCR.A_1-40                     #substructure
070 *      pdb_00003iek.A_17-28,A_56-294,A_320-377 #substructure of 3 disjoint parts
071 * </pre>
072 * More options may be added to the specification at a future time.
073
074 * @author dmyersturnbull
075 * @author Spencer Bliven
076 */
077public class SubstructureIdentifier implements StructureIdentifier {
078
079        private static final long serialVersionUID = 1L;
080
081        private static final Logger logger = LoggerFactory.getLogger(SubstructureIdentifier.class);
082
083
084        private final PdbId pdbId;
085        private final List<ResidueRange> ranges;
086
087        /**
088         * Create a new identifier from a string.
089         * @param id
090         */
091        public SubstructureIdentifier(String id) {
092                String[] idRange = id.split("\\.");
093                if(1 > idRange.length || idRange.length > 2 ) {
094                        throw new IllegalArgumentException(String.format("Malformed %s: %s",getClass().getSimpleName(),id));
095                }
096                //used tempId to avoid writing 2 assignment statements to a final field,
097                // although one is in the try block and the other in the catch block.
098                PdbId tempId = null;
099                try {
100                        tempId = new PdbId(idRange[0]);
101                } catch (IllegalArgumentException e) {
102                        // Changed from Exception to a warning to support files and stuff -sbliven 2015/01/22
103                        logger.warn(String.format("Unrecognized PDB code %s", idRange[0]));
104                }
105                this.pdbId = tempId;
106                
107                if( idRange.length == 2) {
108                        String rangeStr = idRange[1].trim();
109
110                        this.ranges = ResidueRange.parseMultiple(rangeStr);
111                } else {
112                        this.ranges = new LinkedList<ResidueRange>();
113                }
114        }
115
116        /**
117         * Create a new identifier based on a set of ranges.
118         *
119         * If ranges is empty, includes all residues.
120         * @param pdbId
121         * @param ranges
122         */
123        public SubstructureIdentifier(String pdbId, List<ResidueRange> ranges) {
124                this(new PdbId(pdbId), ranges);
125        }
126
127        /**
128         * Create a new identifier based on a set of ranges.
129         *
130         * If ranges is empty, includes all residues.
131         * @param pdbId
132         * @param ranges
133         */
134        public SubstructureIdentifier(PdbId pdbId, List<ResidueRange> ranges) {
135                if(ranges == null) {
136                        throw new NullPointerException("Null ranges list");
137                }
138                this.pdbId = pdbId;
139                this.ranges = ranges;
140        }
141
142        @Override
143        public String toString() {
144                return getIdentifier();
145        }
146
147        /**
148         * Get the String form of this identifier.
149         *
150         * This provides the canonical form for a StructureIdentifier and has
151         * all the information needed to recreate a particular substructure.
152         *
153         * Example: 3iek.A_17-28,A_56-294
154         * @return The String form of this identifier
155         */
156        @Override
157        public String getIdentifier() {
158                String pdbId = this.pdbId == null? "": this.pdbId.getId();
159                if (ranges.isEmpty()) return pdbId;
160                return pdbId + "." + ResidueRange.toString(ranges);
161        }
162
163        /**
164         * Get the PDB identifier part of the SubstructureIdentifier
165         * @return the PDB ID
166         */
167        public PdbId getPdbId() {
168                return pdbId;
169        }
170        
171        public List<ResidueRange> getResidueRanges() {
172                return ranges;
173        }
174
175        /**
176         * Return itself. SubstructureIdentifiers are canonical!
177         */
178        @Override
179        public SubstructureIdentifier toCanonical() {
180                return this;
181        }
182
183        /**
184         * Takes a complete structure as input and reduces it to residues present in
185         * the specified ranges
186         *
187         * <p>The returned structure will be a shallow copy of the input, with shared
188         * Chains, Residues, etc.
189         *
190         * <p>Ligands are handled in a special way. If a full chain is selected
191         * (e.g. '1ABC.A') then any waters and ligands with matching chain name are
192         * included. If a residue range is present ('1ABC.A:1-100') then any
193         * ligands (technically non-water non-polymer atoms) within
194         * {@link StructureTools#DEFAULT_LIGAND_PROXIMITY_CUTOFF} of the selected
195         * range are included, regardless of chain.
196         * @param input A full structure, e.g. as loaded from the PDB. The structure
197         * ID should match that returned by getPdbId().
198         * @return
199         * @throws StructureException
200         * @see StructureTools#getReducedStructure(Structure, String)
201         */
202        @Override
203        public Structure reduce(Structure s) throws StructureException {
204                // Follows StructureImpl.clone()
205
206                if(s == null)
207                        throw new StructureException("NullPointerException Possibly due to malformed PIBId format.");
208
209                // Create new structure & copy basic properties
210                Structure newS = new StructureImpl();
211
212                newS.setPdbId(s.getPdbId());
213                newS.setPDBHeader(s.getPDBHeader());
214                newS.setName(this.toString());
215                newS.setDBRefs(s.getDBRefs());
216                newS.setBiologicalAssembly(s.isBiologicalAssembly());
217                newS.getPDBHeader().setDescription(
218                                "sub-range " + ranges + " of "  + newS.getPdbId() + " "
219                                                + s.getPDBHeader().getDescription());
220                newS.setEntityInfos(new ArrayList<>());
221                // TODO The following should be only copied for atoms which are present in the range.
222                newS.setSSBonds(s.getSSBonds());
223                newS.setSites(s.getSites());
224
225                newS.setStructureIdentifier(this);
226
227                for( int modelNr=0;modelNr<s.nrModels();modelNr++) {
228
229                        // Construct new model
230                        newS.addModel(new ArrayList<Chain>());
231
232                        if(getResidueRanges().isEmpty()) {
233                                // Include all residues
234                                newS.setEntityInfos(s.getEntityInfos());
235                                newS.setSSBonds(s.getSSBonds());
236                                newS.setSites(s.getSites());
237
238                                newS.setModel(modelNr, s.getModel(modelNr));
239                        } else {
240                                // Restrict residues
241                                for( ResidueRange range: getResidueRanges()) {
242
243                                        String chainName = range.getChainName();
244                                        ResidueNumber pdbresnum1 = range.getStart();
245                                        ResidueNumber pdbresnum2 = range.getEnd();
246
247//                                      StructureTools.addGroupsToStructure(newS, groups, modelNr, false);
248                                        Chain polyChain; //polymer
249                                        if(chainName.equals("_") ) {
250                                                // Handle special case of "_" chain for single-chain proteins
251                                                polyChain = s.getPolyChains(modelNr).get(0);
252                                                chainName = polyChain.getName();
253                                                if(pdbresnum1 != null)
254                                                        pdbresnum1.setChainName(chainName);
255                                                if(pdbresnum2 != null)
256                                                        pdbresnum2.setChainName(chainName);
257
258                                                if(s.getPolyChains().size() != 1) {
259                                                        // SCOP 1.71 uses this for some proteins with multiple chains
260                                                        // Print a warning in this ambiguous case
261                                                        logger.warn("Multiple possible chains match '_'. Using chain {}",chainName);
262                                                }
263                                        } else {
264                                                // Explicit chain
265                                                polyChain = s.getPolyChainByPDB(chainName,modelNr);
266                                                if( polyChain == null ) {
267                                                        // Chain not found
268                                                        // Maybe it was a chain index, masquerading as a chainName?
269                                                        try {
270                                                                int chainNum = Integer.parseInt(chainName);
271                                                                polyChain = s.getChainByIndex(modelNr, chainNum);
272                                                                chainName = polyChain.getName();
273                                                                if(pdbresnum1 != null)
274                                                                        pdbresnum1.setChainName(chainName);
275                                                                if(pdbresnum2 != null)
276                                                                        pdbresnum2.setChainName(chainName);
277                                                                logger.warn("No chain found for {}. Interpretting it as an index, using chain {} instead",chainName,polyChain.getId());
278                                                        } catch(NumberFormatException e3) {
279                                                                // Not an index. Throw the original exception
280                                                                throw new StructureException(String.format("Unrecognized chain %s in %s",chainName,getIdentifier()));
281                                                        }
282                                                }
283                                        }
284
285                                        if(pdbresnum1 == null && pdbresnum2 == null) {
286                                                // Include all atoms with matching chainName
287                                                StructureTools.addGroupsToStructure(newS, polyChain.getAtomGroups(), modelNr, false);
288                                                for(Chain chain : s.getNonPolyChainsByPDB(chainName, modelNr) ) {
289                                                        StructureTools.addGroupsToStructure(newS, chain.getAtomGroups(), modelNr, false);
290                                                }
291                                                Chain waters = s.getWaterChainByPDB(chainName, modelNr);
292                                                if( waters != null) {
293                                                        StructureTools.addGroupsToStructure(newS, waters.getAtomGroups(), modelNr, false);
294                                                }
295
296                                                // TODO do we need to prune SeqRes down to the atoms present? -SB 2016-10-7
297                                        } else {
298                                                // Include polymer range and any proximal ligands
299                                                List<Group> polygroups = Arrays.asList(polyChain.getGroupsByPDB(pdbresnum1, pdbresnum2));
300                                                StructureTools.addGroupsToStructure(newS, polygroups, modelNr, false);
301                                                copyLigandsByProximity(s,newS, StructureTools.DEFAULT_LIGAND_PROXIMITY_CUTOFF, modelNr, modelNr);
302                                        }
303                                } // end range
304                        }
305                } // end modelNr
306
307                return newS;
308        }
309
310        /**
311         * Loads the complete structure based on {@link #getPdbId()}.
312         *
313         * @param AtomCache A source of structures
314         * @return A Structure containing at least the atoms identified by this,
315         *  or null if no PDB ID is set
316         * @throws StructureException For errors loading and parsing the structure
317         * @throws IOException Errors reading the structure from disk
318         */
319        @Override
320        public Structure loadStructure(AtomCache cache) throws IOException, StructureException {
321                PdbId pdb = getPdbId();
322                if(pdb == null)
323                        return null;
324                return cache.getStructureForPdbId(pdb);
325        }
326
327        /**
328         * Supplements the reduced structure with ligands from the full structure based on
329         * a distance cutoff. Ligand groups are moved (destructively) from full to reduced
330         * if they fall within the cutoff of any atom in the reduced structure.
331         * The {@link StructureTools#DEFAULT_LIGAND_PROXIMITY_CUTOFF default cutoff}
332         * is used.
333         * @param full Structure containing all ligands
334         * @param reduced Structure with a subset of the polymer groups from full
335         * @see StructureTools#getLigandsByProximity(java.util.Collection, Atom[], double)
336         */
337        protected static void copyLigandsByProximity(Structure full, Structure reduced) {
338                // Normal case where all models should be copied from full to reduced
339                assert full.nrModels() >= reduced.nrModels();
340                for(int model = 0; model< reduced.nrModels(); model++) {
341                        copyLigandsByProximity(full, reduced, StructureTools.DEFAULT_LIGAND_PROXIMITY_CUTOFF, model, model);
342                }
343        }
344        /**
345         * Supplements the reduced structure with ligands from the full structure based on
346         * a distance cutoff. Ligand groups are moved (destructively) from full to reduced
347         * if they fall within the cutoff of any atom in the reduced structure.
348         * @param full Structure containing all ligands
349         * @param reduced Structure with a subset of the polymer groups from full
350         * @param cutoff Distance cutoff (Å)
351         * @param fromModel source model in full
352         * @param toModel destination model in reduced
353         * @see StructureTools#getLigandsByProximity(java.util.Collection, Atom[], double)
354         */
355        protected static void copyLigandsByProximity(Structure full, Structure reduced, double cutoff, int fromModel, int toModel) {
356                // Geometric hashing of the reduced structure
357                Grid grid = new Grid(cutoff);
358                Atom[] nonwaters = StructureTools.getAllNonHAtomArray(reduced,true,toModel);
359                if( nonwaters.length < 1 )
360                        return;
361                grid.addAtoms(nonwaters);
362
363                full.getNonPolyChains(fromModel).stream() //potential ligand chains
364                .flatMap((chain) -> chain.getAtomGroups().stream() ) // potential ligand groups
365                .filter( (g) -> !g.isWater() ) // ignore waters
366                .filter( (g) -> !g.isPolymeric() ) // already shouldn't be polymeric, but filter anyways
367                .filter( (g) -> grid.hasAnyContact(Calc.atomsToPoints(g.getAtoms())) ) // must contact reduced
368                .sequential() // Keeps ligands from the same chain together if possible
369                .reduce((Chain)null, // reduction updates the chain guess
370                                (guess, g ) -> {
371                                        boolean wasAdded;
372                                        try {
373                                                // Check that it's not in reduced already
374                                                wasAdded = reduced.findGroup(g.getChainId(),
375                                                                g.getResidueNumber().toString(), toModel) != null;
376                                        } catch (StructureException e) {
377                                                // not found
378                                                wasAdded = false;
379                                        }
380                                        if( !wasAdded ) {
381                                                // Add the ligand to reduced
382                                                // note this is not idempotent, but it is synchronized on reduced
383                                                logger.info("Adding ligand group {} {} by proximity",g.getPDBName(), g.getResidueNumber().toPDB());
384                                                return StructureTools.addGroupToStructure(reduced, g, toModel, guess, false);
385                                        }
386                                        return guess;
387                                },
388                                // update to the new guess
389                                (oldGuess, newGuess) -> newGuess );
390        }
391
392}