001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure;
022
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Locale;

import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.io.MMCIFFileReader;
import org.biojava.nbio.structure.io.PDBFileReader;
import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation;
import org.biojava.nbio.structure.quaternary.io.BioUnitDataProvider;
import org.biojava.nbio.structure.quaternary.io.BioUnitDataProviderFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
036
037/**
038 * A class that provides static access methods for easy lookup of protein structure related components
039 *
040 * @author Andreas Prlic
041 *
042 * @since 3.0.5
043 */
044public class StructureIO {
045
046        private static final Logger logger = LoggerFactory.getLogger(StructureIO.class);
047
048        private static AtomCache cache ;
049
050
051        /** Loads a structure based on a name. Supported naming conventions are:
052         *
053         *  * <pre>
054                Formal specification for how to specify the <i>name</i>:
055
056                name     := pdbID
057                               | pdbID '.' chainID
058                               | pdbID '.' range
059                               | scopID
060                               | biol
061                               | pdp
062                range         := '('? range (',' range)? ')'?
063                               | chainID
064                               | chainID '_' resNum '-' resNum
065                pdbID         := [0-9][a-zA-Z0-9]{3}
066                chainID       := [a-zA-Z0-9]
067                scopID        := 'd' pdbID [a-z_][0-9_]
068                biol              := 'BIOL:' pdbID [:]? [0-9]+
069                pdp                       := 'PDP:' pdbID[A-Za-z0-9_]+
070                resNum        := [-+]?[0-9]+[A-Za-z]?
071
072
073                Example structures:
074                1TIM            #whole structure - asym unit
075                4HHB.C          #single chain
076                4GCR.A_1-83 #one domain, by residue number
077                3AA0.A,B    #two chains treated as one structure
078                d2bq6a1     #scop domain
079                BIOL:1fah   #biological assembly nr 1 for 1fah
080                BIOL:1fah:0 #asym unit for 1fah
081                BIOL:1fah:1 #biological assembly nr 1 for 1fah
082                BIOL:1fah:2 #biological assembly nr 2 for 1fah
083
084                </pre>
085         *
086         * With the additional set of rules:
087         *
088         *  <ul>
089         *  <li>If only a PDB code is provided, the whole structure will be return including ligands, but the first model only (for NMR).
090         *      <li>Chain IDs are case sensitive, PDB ids are not. To specify a particular chain write as: 4hhb.A or 4HHB.A </li>
091         *  <li>To specify a SCOP domain write a scopId e.g. d2bq6a1. Some flexibility can be allowed in SCOP domain names, see {@link #setStrictSCOP(boolean)}</li>
092         *  <li>URLs are accepted as well</li>
093         *  </ul>
094         *
095         * @param name
096         * @return a Structure object, or null if name appears improperly formated (eg too short, etc)
097         * @throws IOException The PDB file cannot be cached due to IO errors
098         * @throws StructureException The name appeared valid but did not correspond to a structure.
099         *      Also thrown by some submethods upon errors, eg for poorly formatted subranges.
100         */
101        public static Structure getStructure(String name) throws IOException, StructureException{
102
103                checkInitAtomCache();
104
105                // delegate this functionality to AtomCache...
106
107                return cache.getStructure(name);
108
109        }
110
111
112        private static void checkInitAtomCache() {
113                if ( cache == null){
114                        cache = new AtomCache();
115                }
116
117        }
118
119        public static void setAtomCache(AtomCache c){
120                cache = c;
121        }
122
123        /**
124         * Returns the first biologicalAssembly that is available for a protein structure. For more documentation on quaternary structures see:
125         * {@link http://www.pdb.org/pdb/101/static101.do?p=education_discussion/Looking-at-Structures/bioassembly_tutorial.html}
126         *
127         *
128         * @param pdbId
129         * @return a Structure object or null if that assembly is not available
130         * @throws StructureException
131         * @throws IOException
132         */
133        public static Structure getBiologicalAssembly(String pdbId) throws IOException, StructureException{
134
135                return getBiologicalAssembly(pdbId,1);
136        }
137
138        /**
139         * By default the getStructure method loads asym units. This access method allows to recreate the quaternary structure for a protein if it is available.
140         *
141         * @param pdbId
142         * @param biolAssemblyNr - the ith biological assembly that is available for a PDB ID (we start counting at 1, 0 represents the asym unit).
143         * @return a Structure object or null if that assembly is not available
144         * @throws StructureException if there is no bioassembly available for given biolAssemblyNr or some other problems encountered while loading it
145         * @throws IOException
146         */
147        public static Structure getBiologicalAssembly(String pdbId, int biolAssemblyNr) throws IOException, StructureException {
148                checkInitAtomCache();
149                return getBiologicalAssembly(pdbId,biolAssemblyNr,StructureIO.cache);
150        }
151        public static Structure getBiologicalAssembly(String pdbId, int biolAssemblyNr, AtomCache cache) throws IOException, StructureException {
152
153                BioUnitDataProvider provider = null;
154                try {
155                        provider = BioUnitDataProviderFactory.getBioUnitDataProvider();
156                        provider.setAtomCache(cache);
157                        Structure bio = getBiologicalAssembly(pdbId, biolAssemblyNr,cache,BioUnitDataProviderFactory.getBioUnitDataProvider());
158                        return bio;
159                } finally {
160                        if(provider != null) {
161                                //cleanup to avoid memory leaks
162                                provider.setAsymUnit(null);
163                                provider.setAtomCache(null);
164                        }
165                }
166        }
167        public static Structure getBiologicalAssembly(String pdbId, int biolAssemblyNr, AtomCache cache, BioUnitDataProvider provider) throws IOException, StructureException {
168
169                pdbId = pdbId.toLowerCase();
170
171
172
173                Structure asymUnit = provider.getAsymUnit(pdbId);
174
175                // 0 ... asym unit
176                if ( biolAssemblyNr == 0) {
177                        logger.info("Requested biological assembly 0 for PDB id "+pdbId+", returning asymmetric unit");
178                        return asymUnit;
179                }
180                // does it exist?
181                if (!asymUnit.getPDBHeader().getBioAssemblies().containsKey(biolAssemblyNr)) {
182                        throw new StructureException("No biological assembly available for biological assembly nr " + biolAssemblyNr + " of " + pdbId);
183                }
184
185                List<BiologicalAssemblyTransformation> transformations =
186                                asymUnit.getPDBHeader().getBioAssemblies().get(biolAssemblyNr).getTransforms();
187
188
189                if ( transformations == null || transformations.size() == 0){
190
191                        throw new StructureException("Could not load transformations to recreate biological assembly nr " + biolAssemblyNr + " of " + pdbId);
192                }
193                BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder();
194
195                return builder.rebuildQuaternaryStructure(asymUnit, transformations);
196        }
197
198        /**
199         * Does the provider PDB ID have a biological assembly?
200         *
201         * @param pdbId
202         * @return flag if one or more biological assemblies are available
203         */
204        public static boolean hasBiologicalAssembly(String pdbId){
205
206                pdbId = pdbId.toLowerCase();
207
208                BioUnitDataProvider provider = BioUnitDataProviderFactory.getBioUnitDataProvider();
209                checkInitAtomCache();
210                provider.setAtomCache(cache);
211                return provider.hasBiolAssembly(pdbId);
212
213        }
214
215        public static int getNrBiologicalAssemblies(String pdbId){
216
217                pdbId = pdbId.toLowerCase();
218
219                BioUnitDataProvider provider = BioUnitDataProviderFactory.getBioUnitDataProvider();
220                checkInitAtomCache();
221                provider.setAtomCache(cache);
222                return provider.getNrBiolAssemblies(pdbId);
223        }
224
225        private static final String FILE_SEPARATOR = System.getProperty("file.separator");
226
227        /**
228         * Utility method to set the location where PDB files can be found
229         *
230         * @param pathToPDBFiles
231         */
232        public static void setPdbPath(String pathToPDBFiles){
233
234                if ( ! pathToPDBFiles.endsWith(FILE_SEPARATOR))
235                        pathToPDBFiles += FILE_SEPARATOR;
236        }
237
238
239        public static enum StructureFiletype {
240                PDB( (new PDBFileReader()).getExtensions()),
241                CIF( new MMCIFFileReader().getExtensions()),
242                UNKNOWN(Collections.<String>emptyList());
243
244                private List<String> extensions;
245                /**
246                 * @param extensions List of supported extensions, including leading period
247                 */
248                private StructureFiletype(List<String> extensions) {
249                        this.extensions = extensions;
250                }
251                /**
252                 * @return a list of file extensions associated with this type
253                 */
254                public List<String> getExtensions() {
255                        return extensions;
256                }
257        }
258
259        /**
260         * Attempts to guess the type of a structure file based on the extension
261         * @param filename
262         * @return
263         */
264        public static StructureFiletype guessFiletype(String filename) {
265                String lower = filename.toLowerCase();
266                for(StructureFiletype type : StructureFiletype.values()) {
267                        for(String ext : type.getExtensions()) {
268                                if(lower.endsWith(ext.toLowerCase())) {
269                                        return type;
270                                }
271                        }
272                }
273                return StructureFiletype.UNKNOWN;
274        }
275}