001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.symmetry.core;
022
023import org.biojava.nbio.structure.*;
024import org.biojava.nbio.structure.io.mmcif.chem.PolymerType;
025import org.biojava.nbio.structure.io.mmcif.chem.ResidueType;
026import org.biojava.nbio.structure.io.mmcif.model.ChemComp;
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030import java.util.ArrayList;
031import java.util.Collections;
032import java.util.List;
033
034/**
035 * Extracts information about all the chains in a structure, including
036 * chain Ids, sequences, and atoms. Includes both protein and nucleic acid chains.
037 */
038public class ProteinChainExtractor  {
039
040        private static final Logger logger = LoggerFactory.getLogger(ProteinChainExtractor.class);
041
042        private Structure structure = null;
043        private QuatSymmetryParameters parameters = null;
044        private boolean modified = true;
045        private int adjustedMinimumSequenceLength = 0;
046
047        private List<Atom[]> cAlphaTrace = new ArrayList<Atom[]>();
048        private List<String> chainIds = new ArrayList<String>();
049        private List<Integer> modelNumbers = new ArrayList<Integer>();
050        private List<String> sequences = new ArrayList<String>();
051        private int nucleicAcidChainCount = 0;
052
053        public ProteinChainExtractor(Structure structure, QuatSymmetryParameters parameters) {
054                this.structure = structure;
055                this.parameters = parameters;
056                modified = true;
057        }
058
059        public List<Atom[]> getCalphaTraces() {
060                run();
061                return cAlphaTrace;
062        }
063
064        public List<String> getChainIds() {
065                run();
066                return chainIds;
067        }
068
069        public List<Integer> getModelNumbers() {
070                run();
071                return modelNumbers;
072        }
073
074        public List<String> getSequences() {
075                run();
076                return sequences;
077        }
078
079        /**
080         * @return the nucleicAcidChainCount
081         */
082        public int getNucleicAcidChainCount() {
083                run();
084                return nucleicAcidChainCount;
085        }
086
087        public int getAdjustedMinimumSequenceLength() {
088                run();
089                return adjustedMinimumSequenceLength;
090        }
091
092        private void run() {
093                if (modified) {
094                        calcAdjustedMinimumSequenceLength();
095                        extractProteinChains();
096                        modified = false;
097                }
098        }
099
100        private void extractProteinChains() {
101                int models = 1;
102                if (structure.isBiologicalAssembly()) {
103                        models = structure.nrModels();
104                }
105
106                if (parameters.isVerbose()) {
107                        System.out.println("Protein chains used in calculation:");
108                        System.out.println("Adjusted minimum sequence length: " + adjustedMinimumSequenceLength);
109                }
110
111                for (int i = 0; i < models; i++) {
112                        for (Chain c : structure.getChains(i)) {
113                                if (isNucleicAcidChain(c)) {
114                                        nucleicAcidChainCount++;
115                                } //TODO Should we break here for DNA? If "CA" atoms are present, could cause bugs. -Spencer 9-2015
116                                Atom[] ca = StructureTools.getAtomCAArray(c);
117                                ca = retainStandardAminoAcidResidues(ca);
118
119                                if (ca.length >= adjustedMinimumSequenceLength) {
120                                        if (parameters.isVerbose()) {
121                                                System.out.println("Chain " + c.getChainID() + " Calpha atoms: " + ca.length + " seqres: " + c.getSeqResSequence());
122                                        }
123
124                                        cAlphaTrace.add(ca);
125                                        chainIds.add(c.getChainID());
126                                        modelNumbers.add(i);
127                                        sequences.add(replaceQuestionMarks(c.getSeqResSequence()));
128                                }
129                        }
130                }
131        }
132
133        /**
134         * Returns an adapted minimum sequence length. This method ensure that
135         * structure that only have short chains are not excluded by the
136         * minimumSequenceLength cutoff value;
137         * @return
138         */
139        private void calcAdjustedMinimumSequenceLength() {
140                int models = 1;
141                if (structure.isBiologicalAssembly()) {
142                        models = structure.nrModels();
143                }
144
145                int maxLength = Integer.MIN_VALUE;
146                int minLength = Integer.MAX_VALUE;
147
148                List<Integer> lengths = new ArrayList<Integer>();
149                for (int i = 0; i < models; i++) {
150                        for (Chain c : structure.getChains(i)) {
151                                Atom[] ca = StructureTools.getAtomCAArray(c);
152                                ca = retainStandardAminoAcidResidues(ca);
153                                if (ca.length >= parameters.getAbsoluteMinimumSequenceLength()) {
154                                        maxLength = Math.max(ca.length, maxLength);
155                                        minLength = Math.min(ca.length, minLength);
156                                        lengths.add(ca.length);
157                                }
158                        }
159                }
160
161                adjustedMinimumSequenceLength = parameters.getMinimumSequenceLength();
162
163                if (lengths.size() < 2) {
164                        return;
165                }
166
167                double median = 0;
168                Collections.sort(lengths);
169                if (lengths.size() %2 == 1) {
170                        int middle = (lengths.size()-1) / 2;
171                        median = lengths.get(middle);
172                } else {
173                        int middle2 = lengths.size()/2;
174                        int middle1 = middle2-1;
175                        median = 0.5 * (lengths.get(middle1) + lengths.get(middle2));
176                }
177
178                if (minLength >= median * parameters.getMinimumSequenceLengthFraction()) {
179                        adjustedMinimumSequenceLength = Math.min(minLength, parameters.getMinimumSequenceLength());
180                }
181        }
182
183        private boolean isNucleicAcidChain(Chain chain) {
184                int count = 0;
185                for (Group group: chain.getAtomGroups()) {
186                        PolymerType type = group.getChemComp().getPolymerType();
187                        if (type != null && (type.equals(PolymerType.dna) || type.equals(PolymerType.rna) || type.equals(PolymerType.dnarna))) {
188                                count++;
189                        }
190                }
191                return count > 3;
192        }
193
194        // In some cases "?" are in the sequence string. Example 2WS1. This is caused
195        // because the chemical component file YNM doesn't contain a one-letter code.
196        private String replaceQuestionMarks(String sequence) {
197                return sequence.replaceAll("\\?", "X");
198        }
199
200        private Atom[] retainStandardAminoAcidResidues(Atom[] atoms) {
201                List<Atom> atomList = new ArrayList<Atom>(atoms.length);
202                for (Atom atom: atoms) {
203                        Group group = atom.getGroup();
204                        if (group.getPDBName().equalsIgnoreCase("UNK")) {
205                                continue;
206                        }
207                        if (! isAminoAcid(group)) {
208                                continue;
209                        }
210                        atomList.add(atom);
211                }
212                return atomList.toArray(new Atom[atomList.size()]);
213        }
214
215        private boolean isAminoAcid(Group group) {
216                ChemComp cc= group.getChemComp();
217                if (cc.getResidueType() == null) {
218                        logger.warn("null residue type for: " + group.getPDBName());
219                        return false;
220                }
221                return (cc.getResidueType().equals(ResidueType.lPeptideLinking) || cc.getResidueType().equals(ResidueType.glycine));
222        }
223}