Source code

001/*
002 *                  BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on Jan 4, 2006
021 *
022 */
023package org.biojava.nbio.structure;
024
025import java.io.File;
026import java.io.FileInputStream;
027import java.io.IOException;
028import java.io.InputStream;
029import java.util.ArrayList;
030import java.util.HashMap;
031import java.util.HashSet;
032import java.util.Iterator;
033import java.util.LinkedHashSet;
034import java.util.List;
035import java.util.Map;
036import java.util.Set;
037import org.biojava.nbio.structure.align.util.AtomCache;
038import org.biojava.nbio.structure.contact.AtomContactSet;
039import org.biojava.nbio.structure.contact.Grid;
040import org.biojava.nbio.structure.io.FileParsingParameters;
041import org.biojava.nbio.structure.io.PDBFileParser;
042import org.biojava.nbio.structure.io.mmcif.chem.PolymerType;
043import org.biojava.nbio.structure.io.mmcif.chem.ResidueType;
044import org.biojava.nbio.structure.io.mmcif.model.ChemComp;
045import org.biojava.nbio.structure.io.util.FileDownloadUtils;
046import org.slf4j.Logger;
047import org.slf4j.LoggerFactory;
048
049/**
050 * A class that provides some tool methods.
051 *
052 * @author Andreas Prlic, Jules Jacobsen
053 * @since 1.0
054 */
055public class StructureTools {
056
057        private static final Logger logger = LoggerFactory
058                        .getLogger(StructureTools.class);
059
060        // Amino Acid backbone
061        /**
062         * The atom name of the backbone C-alpha atom. Note that this can be
063         * ambiguous depending on the context since Calcium atoms use the same name
064         * in PDB.
065         */
066        public static final String CA_ATOM_NAME = "CA";
067
068        /**
069         * The atom name for the backbone amide nitrogen
070         */
071        public static final String N_ATOM_NAME = "N";
072
073        /**
074         * The atom name for the backbone carbonyl
075         */
076        public static final String C_ATOM_NAME = "C";
077
078        /**
079         * The atom name for the backbone carbonyl oxygen
080         */
081        public static final String O_ATOM_NAME = "O";
082
083        /**
084         * The atom name of the side-chain C-beta atom
085         */
086        public static final String CB_ATOM_NAME = "CB";
087
088        // Nucleotide backbone
089        /**
090         * The atom name of the backbone C1' in RNA
091         */
092        public static final String C1_ATOM_NAME = "C1'";
093        /**
094         * The atom name of the backbone C2' in RNA
095         */
096        public static final String C2_ATOM_NAME = "C2'";
097        /**
098         * The atom name of the backbone C3' in RNA
099         */
100        public static final String C3_ATOM_NAME = "C3'";
101        /**
102         * The atom name of the backbone C4' in RNA
103         */
104        public static final String C4_ATOM_NAME = "C4'";
105        /**
106         * The atom name of the backbone O2' in RNA
107         */
108        public static final String O2_ATOM_NAME = "O2'";
109        /**
110         * The atom name of the backbone O3' in RNA
111         */
112        public static final String O3_ATOM_NAME = "O3'";
113        /**
114         * The atom name of the backbone O4' in RNA
115         */
116        public static final String O4_ATOM_NAME = "O4'";
117        /**
118         * The atom name of the backbone O4' in RNA
119         */
120        public static final String O5_ATOM_NAME = "O5'";
121        /**
122         * The atom name of the backbone O4' in RNA
123         */
124        public static final String OP1_ATOM_NAME = "OP1";
125        /**
126         * The atom name of the backbone O4' in RNA
127         */
128        public static final String OP2_ATOM_NAME = "OP2";
129        /**
130         * The atom name of the backbone phosphate in RNA
131         */
132        public static final String P_ATOM_NAME = "P";
133
134        /**
135         * The atom used as representative for nucleotides, equivalent to
136         * {@link #CA_ATOM_NAME} for proteins
137         */
138        public static final String NUCLEOTIDE_REPRESENTATIVE = P_ATOM_NAME;
139
140        /**
141         * The character to use for unknown compounds in sequence strings
142         */
143        public static final char UNKNOWN_GROUP_LABEL = 'X';
144
145        /**
146         * Below this ratio of aminoacid/nucleotide residues to the sequence total,
147         * we use simple majority of aminoacid/nucleotide residues to decide the
148         * character of the chain (protein/nucleotide)
149         */
150        public static final double RATIO_RESIDUES_TO_TOTAL = 0.95;
151
152        // there is a file format change in PDB 3.0 and nucleotides are being
153        // renamed
154        private static final Map<String, Character> nucleotides30;
155        private static final Map<String, Character> nucleotides23;
156
157        // amino acid 3 and 1 letter code definitions
158        private static final Map<String, Character> aminoAcids;
159
160        private static final Set<Element> hBondDonorAcceptors;
161
162        static {
163                nucleotides30 = new HashMap<String, Character>();
164                nucleotides30.put("DA", 'A');
165                nucleotides30.put("DC", 'C');
166                nucleotides30.put("DG", 'G');
167                nucleotides30.put("DT", 'T');
168                nucleotides30.put("DI", 'I');
169                nucleotides30.put("A", 'A');
170                nucleotides30.put("G", 'G');
171                nucleotides30.put("C", 'C');
172                nucleotides30.put("U", 'U');
173                nucleotides30.put("I", 'I');
174
175                // the DNA linkers - the +C , +G, +A +T +U and +I have been replaced
176                // with these:
177                nucleotides30.put("TAF", UNKNOWN_GROUP_LABEL); // Fluorinated Thymine
178                nucleotides30.put("TC1", UNKNOWN_GROUP_LABEL); // Furanosyl
179                nucleotides30.put("TFE", UNKNOWN_GROUP_LABEL); // Fluorinated Thymine
180                nucleotides30.put("TFO", UNKNOWN_GROUP_LABEL); // Tenofovir (3'
181                                                                                                                // terminator)
182                nucleotides30.put("TGP", UNKNOWN_GROUP_LABEL); // Guanine variant
183                nucleotides30.put("THX", UNKNOWN_GROUP_LABEL); // 5' terminator
184                nucleotides30.put("TLC", UNKNOWN_GROUP_LABEL); // Thymine with dicyclic
185                                                                                                                // sugar
186                nucleotides30.put("TLN", UNKNOWN_GROUP_LABEL); // locked Thymine
187                nucleotides30.put("LCG", UNKNOWN_GROUP_LABEL); // locked Guanine
188                nucleotides30.put("TP1", UNKNOWN_GROUP_LABEL); // Thymine peptide
189                                                                                                                // nucleic acid, with
190                                                                                                                // added methyl
191                nucleotides30.put("CP1", UNKNOWN_GROUP_LABEL); // Cytidine peptide
192                                                                                                                // nucleic acid, with
193                                                                                                                // added methyl
194                nucleotides30.put("TPN", UNKNOWN_GROUP_LABEL); // Thymine peptide
195                                                                                                                // nucleic acid
196                nucleotides30.put("CPN", UNKNOWN_GROUP_LABEL); // Cytidine peptide
197                                                                                                                // nucleic acid
198                nucleotides30.put("GPN", UNKNOWN_GROUP_LABEL); // Guanine peptide
199                                                                                                                // nucleic acid
200                nucleotides30.put("APN", UNKNOWN_GROUP_LABEL); // Adenosine peptide
201                                                                                                                // nucleic acid
202                nucleotides30.put("TPC", UNKNOWN_GROUP_LABEL); // Thymine variant
203
204                // store nucleic acids (C, G, A, T, U, and I), and
205                // the modified versions of nucleic acids (+C, +G, +A, +T, +U, and +I),
206                // and
207                nucleotides23 = new HashMap<String, Character>();
208                String[] names = { "C", "G", "A", "T", "U", "I", "+C", "+G", "+A",
209                                "+T", "+U", "+I" };
210                for (String n : names) {
211                        nucleotides23.put(n, n.charAt(n.length() - 1));
212                }
213
214                aminoAcids = new HashMap<String, Character>();
215                aminoAcids.put("GLY", 'G');
216                aminoAcids.put("ALA", 'A');
217                aminoAcids.put("VAL", 'V');
218                aminoAcids.put("LEU", 'L');
219                aminoAcids.put("ILE", 'I');
220                aminoAcids.put("PHE", 'F');
221                aminoAcids.put("TYR", 'Y');
222                aminoAcids.put("TRP", 'W');
223                aminoAcids.put("PRO", 'P');
224                aminoAcids.put("HIS", 'H');
225                aminoAcids.put("LYS", 'K');
226                aminoAcids.put("ARG", 'R');
227                aminoAcids.put("SER", 'S');
228                aminoAcids.put("THR", 'T');
229                aminoAcids.put("GLU", 'E');
230                aminoAcids.put("GLN", 'Q');
231                aminoAcids.put("ASP", 'D');
232                aminoAcids.put("ASN", 'N');
233                aminoAcids.put("CYS", 'C');
234                aminoAcids.put("MET", 'M');
235                // MSE is only found as a molecular replacement for MET
236                aminoAcids.put("MSE", 'M');
237                // 'non-standard', genetically encoded
238                // http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
239                // IUBMB recommended name is 'SEC' but the wwPDB currently use 'CSE'
240                // likewise 'PYL' (IUBMB) and 'PYH' (PDB)
241                aminoAcids.put("CSE", 'U');
242                aminoAcids.put("SEC", 'U');
243                aminoAcids.put("PYH", 'O');
244                aminoAcids.put("PYL", 'O');
245
246                hBondDonorAcceptors = new HashSet<Element>();
247                hBondDonorAcceptors.add(Element.N);
248                hBondDonorAcceptors.add(Element.O);
249                hBondDonorAcceptors.add(Element.S);
250
251        }
252
253        /**
254         * Count how many Atoms are contained within a Structure object.
255         *
256         * @param s
257         *            the structure object
258         * @return the number of Atoms in this Structure
259         */
260        public static final int getNrAtoms(Structure s) {
261
262                int nrAtoms = 0;
263
264                Iterator<Group> iter = new GroupIterator(s);
265
266                while (iter.hasNext()) {
267                        Group g = iter.next();
268                        nrAtoms += g.size();
269                }
270
271                return nrAtoms;
272        }
273
274        /**
275         * Count how many groups are contained within a structure object.
276         *
277         * @param s
278         *            the structure object
279         * @return the number of groups in the structure
280         */
281        public static final int getNrGroups(Structure s) {
282                int nrGroups = 0;
283
284                List<Chain> chains = s.getChains(0);
285                for (Chain c : chains) {
286                        nrGroups += c.getAtomLength();
287                }
288                return nrGroups;
289        }
290
291        /**
292         * Returns an array of the requested Atoms from the Structure object.
293         * Iterates over all groups and checks if the requested atoms are in this
294         * group, no matter if this is a {@link AminoAcid} or {@link HetatomImpl}
295         * group. If the group does not contain all requested atoms then no atoms
296         * are added for that group. For structures with more than one model, only
297         * model 0 will be used.
298         *
299         * @param s
300         *            the structure to get the atoms from
301         *
302         * @param atomNames
303         *            contains the atom names to be used.
304         * @return an Atom[] array
305         */
306        public static final Atom[] getAtomArray(Structure s, String[] atomNames) {
307                List<Chain> chains = s.getModel(0);
308
309                List<Atom> atoms = new ArrayList<Atom>();
310
311                extractAtoms(atomNames, chains, atoms);
312                return atoms.toArray(new Atom[atoms.size()]);
313
314        }
315
316        /**
317         * Returns an array of the requested Atoms from the Structure object. In
318         * contrast to {@link #getAtomArray(Structure, String[])} this method
319         * iterates over all chains. Iterates over all chains and groups and checks
320         * if the requested atoms are in this group, no matter if this is a
321         * {@link AminoAcid} or {@link HetatomImpl} group. If the group does not
322         * contain all requested atoms then no atoms are added for that group. For
323         * structures with more than one model, only model 0 will be used.
324         *
325         * @param s
326         *            the structure to get the atoms from
327         *
328         * @param atomNames
329         *            contains the atom names to be used.
330         * @return an Atom[] array
331         */
332        public static final Atom[] getAtomArrayAllModels(Structure s,
333                        String[] atomNames) {
334
335                List<Atom> atoms = new ArrayList<Atom>();
336
337                for (int i = 0; i < s.nrModels(); i++) {
338                        List<Chain> chains = s.getModel(i);
339                        extractAtoms(atomNames, chains, atoms);
340                }
341                return atoms.toArray(new Atom[atoms.size()]);
342
343        }
344
345        /**
346         * Convert all atoms of the structure (first model) into an Atom array
347         *
348         * @param s
349         *            input structure
350         * @return all atom array
351         */
352        public static final Atom[] getAllAtomArray(Structure s) {
353                List<Atom> atoms = new ArrayList<Atom>();
354
355                AtomIterator iter = new AtomIterator(s);
356                while (iter.hasNext()) {
357                        Atom a = iter.next();
358                        atoms.add(a);
359                }
360                return atoms.toArray(new Atom[atoms.size()]);
361        }
362
363        /**
364         * Returns and array of all atoms of the chain (first model), including
365         * Hydrogens (if present) and all HETATOMs. Waters are not included.
366         *
367         * @param c
368         *            input chain
369         * @return all atom array
370         */
371        public static final Atom[] getAllAtomArray(Chain c) {
372                List<Atom> atoms = new ArrayList<Atom>();
373
374                for (Group g : c.getAtomGroups()) {
375                        if (g.isWater())
376                                continue;
377                        for (Atom a : g.getAtoms()) {
378                                atoms.add(a);
379                        }
380                }
381                return atoms.toArray(new Atom[atoms.size()]);
382        }
383
384        /**
385         * List of groups from the structure not included in ca (e.g. ligands).
386         *
387         * Unaligned groups are searched from all chains referenced in ca, as well
388         * as any chains in the first model of the structure from ca[0], if any.
389         *
390         * @param ca an array of atoms
391         * @return
392         */
393        public static List<Group> getUnalignedGroups(Atom[] ca) {
394                Set<Chain> chains = new HashSet<Chain>();
395                Set<Group> caGroups = new HashSet<Group>();
396
397                // Create list of all chains in this structure
398                Structure s = null;
399                if (ca.length > 0) {
400                        Group g = ca[0].getGroup();
401                        if (g != null) {
402                                Chain c = g.getChain();
403                                if (c != null) {
404                                        s = c.getStructure();
405                                }
406                        }
407                }
408                if (s != null) {
409                        // Add all chains from the structure
410                        for (Chain c : s.getChains(0)) {
411                                chains.add(c);
412                        }
413                }
414
415                // Add groups and chains from ca
416                for (Atom a : ca) {
417                        Group g = a.getGroup();
418                        if (g != null) {
419                                caGroups.add(g);
420
421                                Chain c = g.getChain();
422                                if (c != null) {
423                                        chains.add(c);
424                                }
425                        }
426                }
427
428                // Iterate through all chains, finding groups not in ca
429                List<Group> unadded = new ArrayList<Group>();
430                for (Chain c : chains) {
431                        for (Group g : c.getAtomGroups()) {
432                                if (!caGroups.contains(g)) {
433                                        unadded.add(g);
434                                }
435                        }
436                }
437                return unadded;
438        }
439
440        /**
441         * Returns and array of all non-Hydrogen atoms in the given Structure,
442         * optionally including HET atoms or not. Waters are not included.
443         *
444         * @param s
445         * @param hetAtoms
446         *            if true HET atoms are included in array, if false they are not
447         * @return
448         */
449        public static final Atom[] getAllNonHAtomArray(Structure s, boolean hetAtoms) {
450                List<Atom> atoms = new ArrayList<Atom>();
451
452                AtomIterator iter = new AtomIterator(s);
453                while (iter.hasNext()) {
454                        Atom a = iter.next();
455                        if (a.getElement() == Element.H)
456                                continue;
457
458                        Group g = a.getGroup();
459
460                        if (g.isWater())
461                                continue;
462
463                        if (!hetAtoms && g.getType().equals(GroupType.HETATM))
464                                continue;
465
466                        atoms.add(a);
467                }
468                return atoms.toArray(new Atom[atoms.size()]);
469        }
470
471        /**
472         * Returns and array of all non-Hydrogen atoms in the given Chain,
473         * optionally including HET atoms or not Waters are not included.
474         *
475         * @param c
476         * @param hetAtoms
477         *            if true HET atoms are included in array, if false they are not
478         * @return
479         */
480        public static final Atom[] getAllNonHAtomArray(Chain c, boolean hetAtoms) {
481                List<Atom> atoms = new ArrayList<Atom>();
482
483                for (Group g : c.getAtomGroups()) {
484                        if (g.isWater())
485                                continue;
486                        for (Atom a : g.getAtoms()) {
487
488                                if (a.getElement() == Element.H)
489                                        continue;
490
491                                if (!hetAtoms && g.getType().equals(GroupType.HETATM))
492                                        continue;
493
494                                atoms.add(a);
495                        }
496                }
497                return atoms.toArray(new Atom[atoms.size()]);
498        }
499
500        /**
501         * Adds to the given atoms list, all atoms of groups that contained all
502         * requested atomNames, i.e. if a group does not contain all of the
503         * requested atom names, its atoms won't be added.
504         *
505         * @param atomNames
506         * @param chains
507         * @param atoms
508         */
509        private static void extractAtoms(String[] atomNames, List<Chain> chains,
510                        List<Atom> atoms) {
511
512                for (Chain c : chains) {
513
514                        for (Group g : c.getAtomGroups()) {
515
516                                // a temp container for the atoms of this group
517                                List<Atom> thisGroupAtoms = new ArrayList<Atom>();
518                                // flag to check if this group contains all the requested atoms.
519                                boolean thisGroupAllAtoms = true;
520                                for (String atomName : atomNames) {
521                                        Atom a = g.getAtom(atomName);
522
523                                        if (a == null) {
524                                                // this group does not have a required atom, skip it...
525                                                thisGroupAllAtoms = false;
526                                                break;
527                                        }
528                                        thisGroupAtoms.add(a);
529                                }
530                                if (thisGroupAllAtoms) {
531                                        // add the atoms of this group to the array.
532                                        for (Atom a : thisGroupAtoms) {
533                                                atoms.add(a);
534                                        }
535                                }
536
537                        }
538                }
539        }
540
541        /**
542         * Returns an array of the requested Atoms from the Chain object. Iterates
543         * over all groups and checks if the requested atoms are in this group, no
544         * matter if this is a AminoAcid or Hetatom group. If the group does not
545         * contain all requested atoms then no atoms are added for that group.
546         *
547         * @param c
548         *            the Chain to get the atoms from
549         *
550         * @param atomNames
551         *            contains the atom names to be used.
552         * @return an Atom[] array
553         */
554        public static final Atom[] getAtomArray(Chain c, String[] atomNames) {
555
556                List<Atom> atoms = new ArrayList<Atom>();
557
558                for (Group g : c.getAtomGroups()) {
559
560                        // a temp container for the atoms of this group
561                        List<Atom> thisGroupAtoms = new ArrayList<Atom>();
562                        // flag to check if this group contains all the requested atoms.
563                        boolean thisGroupAllAtoms = true;
564                        for (String atomName : atomNames) {
565                                Atom a = g.getAtom(atomName);
566                                if (a == null) {
567                                        logger.debug("Group " + g.getResidueNumber() + " ("
568                                                        + g.getPDBName()
569                                                        + ") does not have the required atom '" + atomName
570                                                        + "'");
571                                        // this group does not have a required atom, skip it...
572                                        thisGroupAllAtoms = false;
573                                        break;
574                                }
575                                thisGroupAtoms.add(a);
576                        }
577
578                        if (thisGroupAllAtoms) {
579                                // add the atoms of this group to the array.
580                                for (Atom a : thisGroupAtoms) {
581                                        atoms.add(a);
582                                }
583                        }
584
585                }
586                return atoms.toArray(new Atom[atoms.size()]);
587
588        }
589
590        /**
591         * Returns an Atom array of the C-alpha atoms. Any atom that is a carbon and
592         * has CA name will be returned.
593         *
594         * @param c
595         *            the structure object
596         * @return an Atom[] array
597         * @see #getRepresentativeAtomArray(Chain)
598         */
599        public static final Atom[] getAtomCAArray(Chain c) {
600                List<Atom> atoms = new ArrayList<Atom>();
601
602                for (Group g : c.getAtomGroups()) {
603                        if (g.hasAtom(CA_ATOM_NAME)
604                                        && g.getAtom(CA_ATOM_NAME).getElement() == Element.C) {
605                                atoms.add(g.getAtom(CA_ATOM_NAME));
606                        }
607                }
608
609                return atoms.toArray(new Atom[atoms.size()]);
610        }
611
612        /**
613         * Gets a representative atom for each group that is part of the chain
614         * backbone. Note that modified aminoacids won't be returned as part of the
615         * backbone if the {@link org.biojava.nbio.structure.io.mmcif.ReducedChemCompProvider} was used to load the
616         * structure.
617         *
618         * For amino acids, the representative is a CA carbon. For nucleotides, the
619         * representative is the {@value #NUCLEOTIDE_REPRESENTATIVE}. Other group
620         * types will be ignored.
621         *
622         * @param c
623         * @return representative Atoms of the chain backbone
624         * @since Biojava 4.1.0
625         */
626        public static final Atom[] getRepresentativeAtomArray(Chain c) {
627                List<Atom> atoms = new ArrayList<Atom>();
628
629                for (Group g : c.getAtomGroups()) {
630
631                        switch (g.getType()) {
632                        case AMINOACID:
633                                if (g.hasAtom(CA_ATOM_NAME)
634                                                && g.getAtom(CA_ATOM_NAME).getElement() == Element.C) {
635                                        atoms.add(g.getAtom(CA_ATOM_NAME));
636                                }
637                                break;
638                        case NUCLEOTIDE:
639                                if (g.hasAtom(NUCLEOTIDE_REPRESENTATIVE)) {
640                                        atoms.add(g.getAtom(NUCLEOTIDE_REPRESENTATIVE));
641                                }
642                                break;
643                        default:
644                                // don't add
645                        }
646                }
647
648                return atoms.toArray(new Atom[atoms.size()]);
649
650        }
651
652        /**
653         * Provides an equivalent copy of Atoms in a new array. Clones everything,
654         * starting with parent groups and chains. The chain will only contain
655         * groups that are part of the input array.
656         *
657         * @param ca
658         *            array of representative atoms, e.g. CA atoms
659         * @return Atom array
660         * @deprecated Use the better-named {@link #cloneAtomArray(Atom[])} instead
661         */
662        @Deprecated
663        public static final Atom[] cloneCAArray(Atom[] ca) {
664                return cloneAtomArray(ca);
665        }
666
667        /**
668         * Provides an equivalent copy of Atoms in a new array. Clones everything,
669         * starting with parent groups and chains. The chain will only contain
670         * groups that are part of the input array.
671         *
672         * @param ca
673         *            array of representative atoms, e.g. CA atoms
674         * @return Atom array
675         * @since Biojava 4.1.0
676         */
677        public static final Atom[] cloneAtomArray(Atom[] ca) {
678                Atom[] newCA = new Atom[ca.length];
679
680                List<Chain> model = new ArrayList<Chain>();
681                int apos = -1;
682                for (Atom a : ca) {
683                        apos++;
684                        Group parentG = a.getGroup();
685                        Chain parentC = parentG.getChain();
686
687                        Chain newChain = null;
688                        for (Chain c : model) {
689                                if (c.getChainID().equals(parentC.getChainID())) {
690                                        newChain = c;
691                                        break;
692                                }
693                        }
694                        if (newChain == null) {
695                                newChain = new ChainImpl();
696                                newChain.setChainID(parentC.getChainID());
697                                model.add(newChain);
698                        }
699
700                        Group parentN = (Group) parentG.clone();
701
702                        newCA[apos] = parentN.getAtom(a.getName());
703                        try {
704                                // if the group doesn't exist yet, this produces a StructureException
705                                newChain.getGroupByPDB(parentN.getResidueNumber());
706                        } catch (StructureException e) {
707                                // the group doesn't exist yet in the newChain, let's add it
708                                newChain.addGroup(parentN);
709                        }
710
711                }
712                return newCA;
713        }
714
715        /**
716         * Clone a set of representative Atoms, but returns the parent groups
717         *
718         * @param ca
719         *            Atom array
720         * @return Group array
721         */
722        public static Group[] cloneGroups(Atom[] ca) {
723                Group[] newGroup = new Group[ca.length];
724
725                List<Chain> model = new ArrayList<Chain>();
726                int apos = -1;
727                for (Atom a : ca) {
728                        apos++;
729                        Group parentG = a.getGroup();
730                        Chain parentC = parentG.getChain();
731
732                        Chain newChain = null;
733                        for (Chain c : model) {
734                                if (c.getChainID().equals(parentC.getChainID())) {
735                                        newChain = c;
736                                        break;
737                                }
738                        }
739                        if (newChain == null) {
740                                newChain = new ChainImpl();
741                                newChain.setChainID(parentC.getChainID());
742                                model.add(newChain);
743                        }
744
745                        Group ng = (Group) parentG.clone();
746                        newGroup[apos] = ng;
747                        newChain.addGroup(ng);
748                }
749                return newGroup;
750        }
751
752        /**
753         * Utility method for working with circular permutations. Creates a
754         * duplicated and cloned set of Calpha atoms from the input array.
755         *
756         * @param ca2
757         *            atom array
758         * @return cloned and duplicated set of input array
759         */
760        public static Atom[] duplicateCA2(Atom[] ca2) {
761                // we don't want to rotate input atoms, do we?
762                Atom[] ca2clone = new Atom[ca2.length * 2];
763
764                int pos = 0;
765
766                Chain c = null;
767                String prevChainId = "";
768                for (Atom a : ca2) {
769                        Group g = (Group) a.getGroup().clone(); // works because each group
770                                                                                                        // has only a single atom
771
772                        if (c == null) {
773                                c = new ChainImpl();
774                                Chain orig = a.getGroup().getChain();
775                                c.setChainID(orig.getChainID());
776                        } else {
777                                Chain orig = a.getGroup().getChain();
778                                if (!orig.getChainID().equals(prevChainId)) {
779                                        c = new ChainImpl();
780                                        c.setChainID(orig.getChainID());
781                                }
782                        }
783
784                        c.addGroup(g);
785                        ca2clone[pos] = g.getAtom(a.getName());
786
787                        pos++;
788                }
789
790                // Duplicate ca2!
791                c = null;
792                prevChainId = "";
793                for (Atom a : ca2) {
794                        Group g = (Group) a.getGroup().clone();
795
796                        if (c == null) {
797                                c = new ChainImpl();
798                                Chain orig = a.getGroup().getChain();
799                                c.setChainID(orig.getChainID());
800                        } else {
801                                Chain orig = a.getGroup().getChain();
802                                if (!orig.getChainID().equals(prevChainId)) {
803                                        c = new ChainImpl();
804                                        c.setChainID(orig.getChainID());
805                                }
806                        }
807
808                        c.addGroup(g);
809                        ca2clone[pos] = g.getAtom(a.getName());
810
811                        pos++;
812                }
813
814                return ca2clone;
815
816        }
817
818        /**
819         * Return an Atom array of the C-alpha atoms. Any atom that is a carbon and
820         * has CA name will be returned.
821         *
822         * @param s
823         *            the structure object
824         * @return an Atom[] array
825         * @see #getRepresentativeAtomArray(Structure)
826         */
827        public static Atom[] getAtomCAArray(Structure s) {
828
829                List<Atom> atoms = new ArrayList<Atom>();
830
831                for (Chain c : s.getChains()) {
832                        for (Group g : c.getAtomGroups()) {
833                                if (g.hasAtom(CA_ATOM_NAME)
834                                                && g.getAtom(CA_ATOM_NAME).getElement() == Element.C) {
835                                        atoms.add(g.getAtom(CA_ATOM_NAME));
836                                }
837                        }
838                }
839
840                return atoms.toArray(new Atom[atoms.size()]);
841        }
842
843        /**
844         * Gets a representative atom for each group that is part of the chain
845         * backbone. Note that modified aminoacids won't be returned as part of the
846         * backbone if the {@link org.biojava.nbio.structure.io.mmcif.ReducedChemCompProvider} was used to load the
847         * structure.
848         *
849         * For amino acids, the representative is a CA carbon. For nucleotides, the
850         * representative is the {@value #NUCLEOTIDE_REPRESENTATIVE}. Other group
851         * types will be ignored.
852         *
853         * @param s
854         *            Input structure
855         * @return representative Atoms of the structure backbone
856         * @since Biojava 4.1.0
857         */
858        public static Atom[] getRepresentativeAtomArray(Structure s) {
859
860                List<Atom> atoms = new ArrayList<Atom>();
861
862                for (Chain c : s.getChains()) {
863                        Atom[] chainAtoms = getRepresentativeAtomArray(c);
864                        for (Atom a : chainAtoms) {
865                                atoms.add(a);
866                        }
867                }
868
869                return atoms.toArray(new Atom[atoms.size()]);
870        }
871
872        /**
873         * Return an Atom array of the main chain atoms: CA, C, N, O Any group that
874         * contains those atoms will be included, be it a standard aminoacid or not
875         *
876         * @param s
877         *            the structure object
878         * @return an Atom[] array
879         */
880        public static Atom[] getBackboneAtomArray(Structure s) {
881
882                List<Atom> atoms = new ArrayList<Atom>();
883
884                for (Chain c : s.getChains()) {
885                        for (Group g : c.getAtomGroups()) {
886                                if (g.hasAminoAtoms()) {
887                                        // this means we will only take atoms grom groups that have
888                                        // complete backbones
889                                        for (Atom a : g.getAtoms()) {
890                                                switch (g.getType()) {
891                                                case NUCLEOTIDE:
892                                                        // Nucleotide backbone
893                                                        if (a.getName().equals(C1_ATOM_NAME))
894                                                                atoms.add(a);
895                                                        if (a.getName().equals(C2_ATOM_NAME))
896                                                                atoms.add(a);
897                                                        if (a.getName().equals(C3_ATOM_NAME))
898                                                                atoms.add(a);
899                                                        if (a.getName().equals(C4_ATOM_NAME))
900                                                                atoms.add(a);
901                                                        if (a.getName().equals(O2_ATOM_NAME))
902                                                                atoms.add(a);
903                                                        if (a.getName().equals(O3_ATOM_NAME))
904                                                                atoms.add(a);
905                                                        if (a.getName().equals(O4_ATOM_NAME))
906                                                                atoms.add(a);
907                                                        if (a.getName().equals(O5_ATOM_NAME))
908                                                                atoms.add(a);
909                                                        if (a.getName().equals(OP1_ATOM_NAME))
910                                                                atoms.add(a);
911                                                        if (a.getName().equals(OP2_ATOM_NAME))
912                                                                atoms.add(a);
913                                                        if (a.getName().equals(P_ATOM_NAME))
914                                                                atoms.add(a);
915                                                        // TODO Allow C4* names as well as C4'? -SB 3/2015
916                                                        break;
917                                                case AMINOACID:
918                                                default:
919                                                        // we do it this way instead of with g.getAtom() to
920                                                        // be sure we always use the same order as original
921                                                        if (a.getName().equals(CA_ATOM_NAME))
922                                                                atoms.add(a);
923                                                        if (a.getName().equals(C_ATOM_NAME))
924                                                                atoms.add(a);
925                                                        if (a.getName().equals(N_ATOM_NAME))
926                                                                atoms.add(a);
927                                                        if (a.getName().equals(O_ATOM_NAME))
928                                                                atoms.add(a);
929                                                        break;
930                                                }
931                                        }
932                                }
933                        }
934
935                }
936
937                return atoms.toArray(new Atom[atoms.size()]);
938        }
939
940        /**
941         * Convert three character amino acid codes into single character e.g.
942         * convert CYS to C. Valid 3-letter codes will be those of the standard 20
943         * amino acids plus MSE, CSE, SEC, PYH, PYL (see the {@link #aminoAcids}
944         * map)
945         *
946         * @return the 1 letter code, or null if the given 3 letter code does not
947         *         correspond to an amino acid code
948         * @param groupCode3
949         *            a three character amino acid representation String
950         * @see {@link #get1LetterCode(String)}
951         */
952        public static final Character get1LetterCodeAmino(String groupCode3) {
953                return aminoAcids.get(groupCode3);
954        }
955
956        /**
957         *
958         * @param code3
959         * @return
960         * @deprecated Use {@link #get1LetterCodeAmino(String)} instead
961         */
962        @Deprecated
963        public static final Character convert_3code_1code(String code3) {
964                return get1LetterCodeAmino(code3);
965        }
966
967        /**
968         * Convert a three letter amino acid or nucleotide code into a single
969         * character code. If the code does not correspond to an amino acid or
970         * nucleotide, returns {@link #UNKNOWN_GROUP_LABEL}.
971         *
972         * Returned null for nucleotides prior to version 4.0.1.
973         *
974         * @param groupCode3
975         *            three letter representation
976         * @return The 1-letter abbreviation
977         */
978        public static final Character get1LetterCode(String groupCode3) {
979
980                Character code1;
981
982                // is it a standard amino acid ?
983                code1 = get1LetterCodeAmino(groupCode3);
984
985                if (code1 == null) {
986                        // hm groupCode3 is not standard
987                        // perhaps it is a nucleotide?
988                        groupCode3 = groupCode3.trim();
989                        if (isNucleotide(groupCode3)) {
990                                code1 = nucleotides30.get(groupCode3);
991                                if (code1 == null) {
992                                        code1 = nucleotides23.get(groupCode3);
993                                }
994                                if (code1 == null) {
995                                        code1 = UNKNOWN_GROUP_LABEL;
996                                }
997                        } else {
998                                // does not seem to be so let's assume it is
999                                // nonstandard aminoacid and label it "X"
1000                                // logger.warning("unknown group name "+groupCode3 );
1001                                code1 = UNKNOWN_GROUP_LABEL;
1002                        }
1003                }
1004
1005                return code1;
1006
1007        }
1008
1009        /**
1010         * Test if the three-letter code of an ATOM entry corresponds to a
1011         * nucleotide or to an aminoacid.
1012         *
1013         * @param groupCode3
1014         *            3-character code for a group.
1015         *
1016         */
1017        public static final boolean isNucleotide(String groupCode3) {
1018                String code = groupCode3.trim();
1019                return nucleotides30.containsKey(code)
1020                                || nucleotides23.containsKey(code);
1021        }
1022
1023        /**
1024         * Reduce a structure to provide a smaller representation . Only takes the
1025         * first model of the structure. If chainId is provided only return a
1026         * structure containing that Chain ID. Converts lower case chain IDs to
1027         * upper case if structure does not contain a chain with that ID.
1028         *
1029         * @param s
1030         * @param chainId
1031         * @return Structure
1032         * @since 3.0
1033         * @deprecated Use {@link StructureIdentifier#reduce(Structure)} instead (v. 4.2.0)
1034         */
1035        @Deprecated
1036        public static final Structure getReducedStructure(Structure s,
1037                        String chainId) throws StructureException {
1038                // since we deal here with structure alignments,
1039                // only use Model 1...
1040
1041                Structure newS = new StructureImpl();
1042                newS.setPDBCode(s.getPDBCode());
1043                newS.setPDBHeader(s.getPDBHeader());
1044                newS.setName(s.getName());
1045                newS.setSSBonds(s.getSSBonds());
1046                newS.setDBRefs(s.getDBRefs());
1047                newS.setSites(s.getSites());
1048                newS.setBiologicalAssembly(s.isBiologicalAssembly());
1049                newS.setCompounds(s.getCompounds());
1050                newS.setConnections(s.getConnections());
1051                newS.setSSBonds(s.getSSBonds());
1052                newS.setSites(s.getSites());
1053
1054                if (chainId != null)
1055                        chainId = chainId.trim();
1056
1057                if (chainId == null || chainId.equals("")) {
1058                        // only get model 0
1059                        List<Chain> model0 = s.getModel(0);
1060                        for (Chain c : model0) {
1061                                newS.addChain(c);
1062                        }
1063                        return newS;
1064
1065                }
1066
1067                Chain c = null;
1068                try {
1069                        c = s.getChainByPDB(chainId);
1070                } catch (StructureException e) {
1071                        logger.warn(e.getMessage() + ". Chain id " + chainId
1072                                        + " did not match, trying upper case Chain id.");
1073                        c = s.getChainByPDB(chainId.toUpperCase());
1074
1075                }
1076                if (c != null) {
1077                        newS.addChain(c);
1078                        for (Compound comp : s.getCompounds()) {
1079                                if (comp.getChainIds() != null
1080                                                && comp.getChainIds().contains(c.getChainID())) {
1081                                        // found matching compound. set description...
1082                                        newS.getPDBHeader().setDescription(
1083                                                        "Chain " + c.getChainID() + " of " + s.getPDBCode()
1084                                                                        + " " + comp.getMolName());
1085                                }
1086                        }
1087                }
1088
1089                return newS;
1090        }
1091
1092        /**
1093         * Reduce a structure to provide a smaller representation. Only takes the
1094         * first model of the structure. If chainNr >=0 only takes the chain at that
1095         * position into account.
1096         *
1097         * @param s
1098         * @param chainNr
1099         *            can be -1 to request all chains of model 0, otherwise will
1100         *            only add chain at this position
1101         * @return Structure object
1102         * @since 3.0
1103         * @deprecated Use {@link StructureIdentifier#reduce(Structure)} instead (v. 4.2.0)
1104         */
1105        @Deprecated
1106        public static final Structure getReducedStructure(Structure s, int chainNr)
1107                        throws StructureException {
1108                // since we deal here with structure alignments,
1109                // only use Model 1...
1110
1111                Structure newS = new StructureImpl();
1112                newS.setPDBCode(s.getPDBCode());
1113                newS.setPDBHeader(s.getPDBHeader());
1114                newS.setName(s.getName());
1115                newS.setSSBonds(s.getSSBonds());
1116                newS.setDBRefs(s.getDBRefs());
1117                newS.setSites(s.getSites());
1118                newS.setBiologicalAssembly(s.isBiologicalAssembly());
1119                newS.setCompounds(s.getCompounds());
1120                newS.setConnections(s.getConnections());
1121                newS.setSSBonds(s.getSSBonds());
1122                newS.setSites(s.getSites());
1123                newS.setCrystallographicInfo(s.getCrystallographicInfo());
1124                newS.getPDBHeader().setDescription(
1125                                "subset of " + s.getPDBCode() + " "
1126                                                + s.getPDBHeader().getDescription());
1127
1128                if (chainNr < 0) {
1129
1130                        // only get model 0
1131                        List<Chain> model0 = s.getModel(0);
1132                        for (Chain c : model0) {
1133                                newS.addChain(c);
1134                        }
1135                        return newS;
1136                }
1137                Chain c = null;
1138
1139                c = s.getChain(0, chainNr);
1140
1141                newS.addChain(c);
1142
1143                return newS;
1144        }
1145
1146        /**
1147         * In addition to the functionality provided by
1148         * {@link #getReducedStructure(Structure, int)} and
1149         * {@link #getReducedStructure(Structure, String)}, also provides a way to
1150         * specify sub-regions of a structure with the following specification:
1151         *
1152         * <p>
1153         * <li>ranges can be surrounded by ( and ). (but will be removed).</li>
1154         * <li>ranges are specified as PDBresnum1 : PDBresnum2</li>
1155         *
1156         * <li>a list of ranges is separated by ,</li>
1157         * </p>
1158         * Example
1159         *
1160         * <pre>
1161         *  4GCR (A:1-83)
1162         *  1CDG (A:407-495,A:582-686)
1163         *  1CDG (A_407-495,A_582-686)
1164         * </pre>
1165         *
1166         * @param s
1167         *            The full structure
1168         * @param ranges
1169         *            A comma-separated list of ranges, optionally surrounded by
1170         *            parentheses
1171         * @return Substructure of s specified by ranges
1172         * @throws IllegalArgumentException for malformed range strings
1173         * @throws StructureException for errors when reducing the Structure
1174         * @deprecated Use {@link StructureIdentifier} instead (4.2.0)
1175         */
1176        @Deprecated
1177        public static final Structure getSubRanges(Structure s, String ranges )
1178                        throws StructureException
1179        {
1180                if (ranges == null || ranges.equals(""))
1181                        throw new IllegalArgumentException("ranges can't be null or empty");
1182
1183                ranges = ranges.trim();
1184
1185                if (ranges.startsWith("("))
1186                        ranges = ranges.substring(1);
1187                if (ranges.endsWith(")")) {
1188                        ranges = ranges.substring(0, ranges.length() - 1);
1189                }
1190
1191                // special case: '-' means 'everything'
1192                if (ranges.equals("-")) {
1193                        return s;
1194                }
1195
1196                List<ResidueRange> resRanges = ResidueRange.parseMultiple(ranges);
1197                SubstructureIdentifier structId = new SubstructureIdentifier(null,resRanges);
1198                return structId.reduce(s);
1199        }
1200
1201        public static final String convertAtomsToSeq(Atom[] atoms) {
1202
1203                StringBuilder buf = new StringBuilder();
1204                Group prevGroup = null;
1205                for (Atom a : atoms) {
1206                        Group g = a.getGroup();
1207                        if (prevGroup != null) {
1208                                if (prevGroup.equals(g)) {
1209                                        // we add each group only once.
1210                                        continue;
1211                                }
1212                        }
1213                        String code3 = g.getPDBName();
1214                        Character code1 = get1LetterCodeAmino(code3);
1215                        if (code1 == null)
1216                                code1 = UNKNOWN_GROUP_LABEL;
1217
1218                        buf.append(code1);
1219
1220                        prevGroup = g;
1221
1222                }
1223                return buf.toString();
1224        }
1225
1226        /**
1227         * Get a group represented by a ResidueNumber.
1228         *
1229         * @param struc
1230         *            a {@link Structure}
1231         * @param pdbResNum
1232         *            a {@link ResidueNumber}
1233         * @return a group in the structure that is represented by the pdbResNum.
1234         * @throws StructureException
1235         *             if the group cannot be found.
1236         */
1237        public static final Group getGroupByPDBResidueNumber(Structure struc,
1238                        ResidueNumber pdbResNum) throws StructureException {
1239                if (struc == null || pdbResNum == null) {
1240                        throw new IllegalArgumentException("Null argument(s).");
1241                }
1242
1243                Chain chain = struc.findChain(pdbResNum.getChainId());
1244
1245                return chain.getGroupByPDB(pdbResNum);
1246        }
1247
1248        /**
1249         * Returns the set of intra-chain contacts for the given chain for given
1250         * atom names, i.e. the contact map. Uses a geometric hashing algorithm that
1251         * speeds up the calculation without need of full distance matrix. The
1252         * parsing mode {@link FileParsingParameters#setAlignSeqRes(boolean)} needs
1253         * to be set to true for this to work.
1254         *
1255         * @param chain
1256         * @param atomNames
1257         *            the array with atom names to be used. Beware: CA will do both
1258         *            C-alphas an Calciums if null all non-H atoms of non-hetatoms
1259         *            will be used
1260         * @param cutoff
1261         * @return
1262         */
1263        public static AtomContactSet getAtomsInContact(Chain chain,
1264                        String[] atomNames, double cutoff) {
1265                Grid grid = new Grid(cutoff);
1266
1267                Atom[] atoms = null;
1268                if (atomNames == null) {
1269                        atoms = getAllNonHAtomArray(chain, false);
1270                } else {
1271                        atoms = getAtomArray(chain, atomNames);
1272                }
1273
1274                grid.addAtoms(atoms);
1275
1276                return grid.getContacts();
1277        }
1278
1279        /**
1280         * Returns the set of intra-chain contacts for the given chain for all non-H
1281         * atoms of non-hetatoms, i.e. the contact map. Uses a geometric hashing
1282         * algorithm that speeds up the calculation without need of full distance
1283         * matrix. The parsing mode
1284         * {@link FileParsingParameters#setAlignSeqRes(boolean)} needs to be set to
1285         * true for this to work.
1286         *
1287         * @param chain
1288         * @param cutoff
1289         * @return
1290         */
1291        public static AtomContactSet getAtomsInContact(Chain chain, double cutoff) {
1292                return getAtomsInContact(chain, (String[]) null, cutoff);
1293        }
1294
1295        /**
1296         * Returns the set of intra-chain contacts for the given chain for C-alpha
1297         * atoms (including non-standard aminoacids appearing as HETATM groups),
1298         * i.e. the contact map. Uses a geometric hashing algorithm that speeds up
1299         * the calculation without need of full distance matrix. The parsing mode
1300         * {@link FileParsingParameters#setAlignSeqRes(boolean)} needs to be set to
1301         * true for this to work.
1302         *
1303         * @param chain
1304         * @param cutoff
1305         * @return
1306         * @see {@link #getRepresentativeAtomsInContact(Chain, double)}
1307         */
1308        public static AtomContactSet getAtomsCAInContact(Chain chain, double cutoff) {
1309                Grid grid = new Grid(cutoff);
1310
1311                Atom[] atoms = getAtomCAArray(chain);
1312
1313                grid.addAtoms(atoms);
1314
1315                return grid.getContacts();
1316        }
1317
1318        /**
1319         * Returns the set of intra-chain contacts for the given chain for C-alpha
1320         * or C3' atoms (including non-standard aminoacids appearing as HETATM
1321         * groups), i.e. the contact map. Uses a geometric hashing algorithm that
1322         * speeds up the calculation without need of full distance matrix.
1323         *
1324         * @param chain
1325         * @param cutoff
1326         * @return
1327         * @since Biojava 4.1.0
1328         */
1329        public static AtomContactSet getRepresentativeAtomsInContact(Chain chain,
1330                        double cutoff) {
1331                Grid grid = new Grid(cutoff);
1332
1333                Atom[] atoms = getRepresentativeAtomArray(chain);
1334
1335                grid.addAtoms(atoms);
1336
1337                return grid.getContacts();
1338        }
1339
1340        /**
1341         * Returns the set of inter-chain contacts between the two given chains for
1342         * the given atom names. Uses a geometric hashing algorithm that speeds up
1343         * the calculation without need of full distance matrix. The parsing mode
1344         * {@link FileParsingParameters#setAlignSeqRes(boolean)} needs to be set to
1345         * true for this to work.
1346         *
1347         * @param chain1
1348         * @param chain2
1349         * @param atomNames
1350         *            the array with atom names to be used. For Calphas use {"CA"},
1351         *            if null all non-H atoms will be used. Note HET atoms are
1352         *            ignored unless this parameter is null.
1353         * @param cutoff
1354         * @param hetAtoms
1355         *            if true HET atoms are included, if false they are not
1356         * @return
1357         */
1358        public static AtomContactSet getAtomsInContact(Chain chain1, Chain chain2,
1359                        String[] atomNames, double cutoff, boolean hetAtoms) {
1360                Grid grid = new Grid(cutoff);
1361                Atom[] atoms1 = null;
1362                Atom[] atoms2 = null;
1363                if (atomNames == null) {
1364                        atoms1 = getAllNonHAtomArray(chain1, hetAtoms);
1365                        atoms2 = getAllNonHAtomArray(chain2, hetAtoms);
1366                } else {
1367                        atoms1 = getAtomArray(chain1, atomNames);
1368                        atoms2 = getAtomArray(chain2, atomNames);
1369                }
1370                grid.addAtoms(atoms1, atoms2);
1371
1372                return grid.getContacts();
1373        }
1374
1375        /**
1376         * Returns the set of inter-chain contacts between the two given chains for
1377         * all non-H atoms. Uses a geometric hashing algorithm that speeds up the
1378         * calculation without need of full distance matrix. The parsing mode
1379         * {@link FileParsingParameters#setAlignSeqRes(boolean)} needs to be set to
1380         * true for this to work.
1381         *
1382         * @param chain1
1383         * @param chain2
1384         * @param cutoff
1385         * @param hetAtoms
1386         *            if true HET atoms are included, if false they are not
1387         * @return
1388         */
1389        public static AtomContactSet getAtomsInContact(Chain chain1, Chain chain2,
1390                        double cutoff, boolean hetAtoms) {
1391                return getAtomsInContact(chain1, chain2, null, cutoff, hetAtoms);
1392        }
1393
1394        /**
1395         * Finds Groups in {@code structure} that contain at least one Atom that is
1396         * within {@code radius} Angstroms of {@code centroid}.
1397         *
1398         * @param structure
1399         *            The structure from which to find Groups
1400         * @param centroid
1401         *            The centroid of the shell
1402         * @param excludeResidues
1403         *            A list of ResidueNumbers to exclude
1404         * @param radius
1405         *            The radius from {@code centroid}, in Angstroms
1406         * @param includeWater
1407         *            Whether to include Groups whose <em>only</em> atoms are water
1408         * @param useAverageDistance
1409         *            When set to true, distances are the arithmetic mean (1-norm)
1410         *            of the distances of atoms that belong to the group and that
1411         *            are within the shell; otherwise, distances are the minimum of
1412         *            these values
1413         * @return A map of Groups within (or partially within) the shell, to their
1414         *         distances in Angstroms
1415         */
1416        public static Map<Group, Double> getGroupDistancesWithinShell(
1417                        Structure structure, Atom centroid,
1418                        Set<ResidueNumber> excludeResidues, double radius,
1419                        boolean includeWater, boolean useAverageDistance) {
1420
1421                // for speed, we avoid calculating square roots
1422                radius = radius * radius;
1423
1424                Map<Group, Double> distances = new HashMap<Group, Double>();
1425
1426                // we only need this if we're averaging distances
1427                // note that we can't use group.getAtoms().size() because some the
1428                // group's atoms be outside the shell
1429                Map<Group, Integer> atomCounts = new HashMap<Group, Integer>();
1430
1431                for (Chain chain : structure.getChains()) {
1432                        groupLoop: for (Group chainGroup : chain.getAtomGroups()) {
1433
1434                                // exclude water
1435                                if (!includeWater && chainGroup.isWater())
1436                                        continue;
1437
1438                                // check blacklist of residue numbers
1439                                for (ResidueNumber rn : excludeResidues) {
1440                                        if (rn.equals(chainGroup.getResidueNumber()))
1441                                                continue groupLoop;
1442                                }
1443
1444                                for (Atom testAtom : chainGroup.getAtoms()) {
1445
1446                                        // use getDistanceFast as we are doing a lot of comparisons
1447                                        double dist = Calc.getDistanceFast(centroid, testAtom);
1448
1449                                        // if we're the shell
1450                                        if (dist <= radius) {
1451                                                if (!distances.containsKey(chainGroup))
1452                                                        distances.put(chainGroup, Double.POSITIVE_INFINITY);
1453                                                if (useAverageDistance) {
1454                                                        // sum the distance; we'll divide by the total
1455                                                        // number later
1456                                                        // here, we CANNOT use fastDistance (distance
1457                                                        // squared) because we want the arithmetic mean
1458                                                        distances.put(chainGroup, distances.get(chainGroup)
1459                                                                        + Math.sqrt(dist));
1460                                                        if (!atomCounts.containsKey(chainGroup))
1461                                                                atomCounts.put(chainGroup, 0);
1462                                                        atomCounts.put(chainGroup,
1463                                                                        atomCounts.get(chainGroup) + 1);
1464                                                } else {
1465                                                        // take the minimum distance among all atoms of
1466                                                        // chainGroup
1467                                                        // note that we can't break here because we might
1468                                                        // find a smaller distance
1469                                                        if (dist < distances.get(chainGroup)) {
1470                                                                distances.put(chainGroup, dist);
1471                                                        }
1472                                                }
1473                                        }
1474
1475                                }
1476                        }
1477                }
1478
1479                if (useAverageDistance) {
1480                        for (Map.Entry<Group, Double> entry : distances.entrySet()) {
1481                                int count = atomCounts.get(entry.getKey());
1482                                distances.put(entry.getKey(), entry.getValue() / count);
1483                        }
1484                } else {
1485                        // in this case we used getDistanceFast
1486                        for (Map.Entry<Group, Double> entry : distances.entrySet()) {
1487                                distances.put(entry.getKey(), Math.sqrt(entry.getValue()));
1488                        }
1489                }
1490
1491                return distances;
1492
1493        }
1494
1495        public static Set<Group> getGroupsWithinShell(Structure structure,
1496                        Atom atom, Set<ResidueNumber> excludeResidues, double distance,
1497                        boolean includeWater) {
1498
1499                // square the distance to use as a comparison against getDistanceFast
1500                // which returns the square of a distance.
1501                distance = distance * distance;
1502
1503                Set<Group> returnSet = new LinkedHashSet<Group>();
1504                for (Chain chain : structure.getChains()) {
1505                        groupLoop: for (Group chainGroup : chain.getAtomGroups()) {
1506                                if (!includeWater && chainGroup.isWater())
1507                                        continue;
1508                                for (ResidueNumber rn : excludeResidues) {
1509                                        if (rn.equals(chainGroup.getResidueNumber()))
1510                                                continue groupLoop;
1511                                }
1512                                for (Atom atomB : chainGroup.getAtoms()) {
1513
1514                                        // use getDistanceFast as we are doing a lot of comparisons
1515                                        double dist = Calc.getDistanceFast(atom, atomB);
1516                                        if (dist <= distance) {
1517                                                returnSet.add(chainGroup);
1518                                                break;
1519                                        }
1520
1521                                }
1522                        }
1523                }
1524                return returnSet;
1525        }
1526
1527        /**
1528         * <p>
1529         * Returns a Set of Groups in a structure within the distance specified of a
1530         * given group.
1531         * </p>
1532         * <p>
1533         * Updated 18-Sep-2015 sroughley to return a Set so only a unique set of
1534         * Groups returned
1535         *
1536         * @param structure
1537         *            The structure to work with
1538         * @param group
1539         *            The 'query' group
1540         * @param distance
1541         *            The cutoff distance
1542         * @param includeWater
1543         *            Should water residues be included in the output?
1544         * @return {@link LinkedHashSet} of {@link Group}s within at least one atom
1545         *         with {@code distance} of at least one atom in {@code group}
1546         */
1547        public static Set<Group> getGroupsWithinShell(Structure structure,
1548                        Group group, double distance, boolean includeWater) {
1549
1550                Set<Group> returnList = new LinkedHashSet<Group>();
1551
1552                Set<ResidueNumber> excludeGroups = new HashSet<ResidueNumber>();
1553                excludeGroups.add(group.getResidueNumber());
1554                for (Atom atom : group.getAtoms()) {
1555                        Set<Group> set = getGroupsWithinShell(structure, atom,
1556                                        excludeGroups, distance, includeWater);
1557                        returnList.addAll(set);
1558                }
1559
1560                return returnList;
1561        }
1562
1563        /**
1564         * Remove all models from a Structure and keep only the first
1565         *
1566         * @param s
1567         *            original Structure
1568         * @return a structure that contains only the first model
1569         * @since 3.0.5
1570         */
1571        public static Structure removeModels(Structure s) {
1572                if (s.nrModels() == 1)
1573                        return s;
1574
1575                Structure n = new StructureImpl();
1576                // go through whole substructure and clone ...
1577
1578                // copy structure data
1579
1580                n.setPDBCode(s.getPDBCode());
1581                n.setName(s.getName());
1582
1583                // TODO: do deep copying of data!
1584                n.setPDBHeader(s.getPDBHeader());
1585                n.setDBRefs(s.getDBRefs());
1586
1587                n.setSites(s.getSites());
1588
1589                n.setChains(s.getModel(0));
1590
1591                return n;
1592
1593        }
1594
1595        /**
1596         * Removes all polymeric and solvent groups from a list of groups
1597         *
1598         */
1599        public static List<Group> filterLigands(List<Group> allGroups) {
1600
1601                List<Group> groups = new ArrayList<Group>();
1602                for (Group g : allGroups) {
1603
1604                        ChemComp cc = g.getChemComp();
1605
1606                        if (ResidueType.lPeptideLinking.equals(cc.getResidueType())
1607                                        || PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType())
1608                                        || PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc
1609                                                        .getPolymerType())) {
1610                                continue;
1611                        }
1612                        if (!g.isWater()) {
1613                                groups.add(g);
1614                        }
1615                }
1616
1617                return groups;
1618        }
1619
1620        /**
1621         * Short version of {@link #getStructure(String, PDBFileParser, AtomCache)}
1622         * which creates new parsers when needed
1623         *
1624         * @param name
1625         * @return
1626         * @throws IOException
1627         * @throws StructureException
1628         */
1629        public static Structure getStructure(String name) throws IOException,
1630                        StructureException {
1631                return StructureTools.getStructure(name, null, null);
1632        }
1633
1634        /**
1635         * Flexibly get a structure from an input String. The intent of this method
1636         * is to allow any reasonable string which could refer to a structure to be
1637         * correctly parsed. The following are currently supported:
1638         * <ol>
1639         * <li>Filename (if name refers to an existing file)
1640         * <li>PDB ID
1641         * <li>SCOP domains
1642         * <li>PDP domains
1643         * <li>Residue ranges
1644         * <li>Other formats supported by AtomCache
1645         * </ol>
1646         *
1647         * @param name
1648         *            Some reference to the protein structure
1649         * @param parser
1650         *            A clean PDBFileParser to use if it is a file. If null, a
1651         *            PDBFileParser will be instantiated if needed.
1652         * @param cache
1653         *            An AtomCache to use if the structure can be fetched from the
1654         *            PDB. If null, a AtomCache will be instantiated if needed.
1655         * @return A Structure object
1656         * @throws IOException
1657         *             if name is an existing file, but doesn't parse correctly
1658         * @throws StructureException
1659         *             if the format is unknown, or if AtomCache throws an
1660         *             exception.
1661         */
1662        public static Structure getStructure(String name, PDBFileParser parser,
1663                        AtomCache cache) throws IOException, StructureException {
1664                File f = new File(FileDownloadUtils.expandUserHome(name));
1665                if (f.exists()) {
1666                        if (parser == null) {
1667                                parser = new PDBFileParser();
1668                        }
1669                        InputStream inStream = new FileInputStream(f);
1670                        return parser.parsePDBFile(inStream);
1671                } else {
1672                        if (cache == null) {
1673                                cache = new AtomCache();
1674                        }
1675                        return cache.getStructure(name);
1676                }
1677        }
1678
1679        /**
1680         * Tell whether given chain is a protein chain
1681         *
1682         * @param c
1683         * @return true if protein, false if nucleotide or ligand
1684         * @see #getPredominantGroupType(Chain)
1685         */
1686        public static boolean isProtein(Chain c) {
1687                return getPredominantGroupType(c) == GroupType.AMINOACID;
1688        }
1689
1690        /**
1691         * Tell whether given chain is DNA or RNA
1692         *
1693         * @param c
1694         * @return true if nucleic acid, false if protein or ligand
1695         * @see #getPredominantGroupType(Chain)
1696         */
1697        public static boolean isNucleicAcid(Chain c) {
1698                return getPredominantGroupType(c) == GroupType.NUCLEOTIDE;
1699        }
1700
1701        /**
1702         * Get the predominant {@link GroupType} for a given Chain, following these
1703         * rules: <li>if the ratio of number of residues of a certain
1704         * {@link GroupType} to total non-water residues is above the threshold
1705         * {@value #RATIO_RESIDUES_TO_TOTAL}, then that {@link GroupType} is
1706         * returned</li> <li>if there is no {@link GroupType} that is above the
1707         * threshold then the {@link GroupType} with most members is chosen, logging
1708         * it</li>
1709         * <p>
1710         * See also {@link ChemComp#getPolymerType()} and
1711         * {@link ChemComp#getResidueType()} which follow the PDB chemical component
1712         * dictionary and provide a much more accurate description of groups and
1713         * their linking.
1714         * </p>
1715         *
1716         * @param c
1717         * @return
1718         */
1719        public static GroupType getPredominantGroupType(Chain c) {
1720                int sizeAminos = c.getAtomGroups(GroupType.AMINOACID).size();
1721                int sizeNucleotides = c.getAtomGroups(GroupType.NUCLEOTIDE).size();
1722                List<Group> hetAtoms = c.getAtomGroups(GroupType.HETATM);
1723                int sizeHetatoms = hetAtoms.size();
1724                int sizeWaters = 0;
1725                for (Group g : hetAtoms) {
1726                        if (g.isWater())
1727                                sizeWaters++;
1728                }
1729                int sizeHetatomsWithoutWater = sizeHetatoms - sizeWaters;
1730
1731                int fullSize = sizeAminos + sizeNucleotides + sizeHetatomsWithoutWater;
1732
1733                if ((double) sizeAminos / (double) fullSize > RATIO_RESIDUES_TO_TOTAL)
1734                        return GroupType.AMINOACID;
1735
1736                if ((double) sizeNucleotides / (double) fullSize > RATIO_RESIDUES_TO_TOTAL)
1737                        return GroupType.NUCLEOTIDE;
1738
1739                if ((double) (sizeHetatomsWithoutWater) / (double) fullSize > RATIO_RESIDUES_TO_TOTAL)
1740                        return GroupType.HETATM;
1741
1742                // finally if neither condition works, we try based on majority, but log
1743                // it
1744                GroupType max;
1745                if (sizeNucleotides > sizeAminos) {
1746                        if (sizeNucleotides > sizeHetatomsWithoutWater) {
1747                                max = GroupType.NUCLEOTIDE;
1748                        } else {
1749                                max = GroupType.HETATM;
1750                        }
1751                } else {
1752                        if (sizeAminos > sizeHetatomsWithoutWater) {
1753                                max = GroupType.AMINOACID;
1754                        } else {
1755                                max = GroupType.HETATM;
1756                        }
1757                }
1758                logger.debug(
1759                                "Ratio of residues to total for chain {} is below {}. Assuming it is a {} chain. "
1760                                                + "Counts: # aa residues: {}, # nuc residues: {}, # non-water het residues: {}, # waters: {}, "
1761                                                + "ratio aa/total: {}, ratio nuc/total: {}",
1762                                c.getChainID(), RATIO_RESIDUES_TO_TOTAL, max, sizeAminos,
1763                                sizeNucleotides, sizeHetatomsWithoutWater, sizeWaters,
1764                                (double) sizeAminos / (double) fullSize,
1765                                (double) sizeNucleotides / (double) fullSize);
1766
1767                return max;
1768        }
1769
1770        /**
1771         * Returns true if the given chain is composed of water molecules only
1772         *
1773         * @param c
1774         * @return
1775         */
1776        public static boolean isChainWaterOnly(Chain c) {
1777                boolean waterOnly = true;
1778                for (Group g : c.getAtomGroups()) {
1779                        if (!g.isWater())
1780                                waterOnly = false;
1781                        break;
1782                }
1783                return waterOnly;
1784        }
1785
1786        /**
1787         * Returns true if the given chain is composed of non-polymeric groups only
1788         *
1789         * @param c
1790         * @return
1791         */
1792        public static boolean isChainPureNonPolymer(Chain c) {
1793
1794                for (Group g : c.getAtomGroups()) {
1795                        if (g.getType() == GroupType.AMINOACID
1796                                        || g.getType() == GroupType.NUCLEOTIDE)
1797                                return false;
1798
1799                }
1800                return true;
1801        }
1802}