001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.symmetry.core; 022 023import org.biojava.nbio.structure.*; 024import org.biojava.nbio.structure.io.mmcif.chem.PolymerType; 025import org.biojava.nbio.structure.io.mmcif.chem.ResidueType; 026import org.biojava.nbio.structure.io.mmcif.model.ChemComp; 027import org.slf4j.Logger; 028import org.slf4j.LoggerFactory; 029 030import java.util.ArrayList; 031import java.util.Collections; 032import java.util.List; 033 034/** 035 * Extracts information about all the chains in a structure, including 036 * chain Ids, sequences, and atoms. Includes both protein and nucleic acid chains. 037 */ 038public class ProteinChainExtractor { 039 040 private static final Logger logger = LoggerFactory.getLogger(ProteinChainExtractor.class); 041 042 private Structure structure = null; 043 private QuatSymmetryParameters parameters = null; 044 private boolean modified = true; 045 private int adjustedMinimumSequenceLength = 0; 046 047 private List<Atom[]> cAlphaTrace = new ArrayList<Atom[]>(); 048 private List<String> chainIds = new ArrayList<String>(); 049 private List<Integer> modelNumbers = new ArrayList<Integer>(); 050 private List<String> sequences = new ArrayList<String>(); 051 private int nucleicAcidChainCount = 0; 052 053 public ProteinChainExtractor(Structure structure, QuatSymmetryParameters parameters) { 054 this.structure = structure; 055 this.parameters = parameters; 056 modified = true; 057 } 058 059 public List<Atom[]> getCalphaTraces() { 060 run(); 061 return cAlphaTrace; 062 } 063 064 public List<String> getChainIds() { 065 run(); 066 return chainIds; 067 } 068 069 public List<Integer> getModelNumbers() { 070 run(); 071 return modelNumbers; 072 } 073 074 public List<String> getSequences() { 075 run(); 076 return sequences; 077 } 078 079 /** 080 * @return the nucleicAcidChainCount 081 */ 082 public int getNucleicAcidChainCount() { 083 run(); 084 return nucleicAcidChainCount; 085 } 086 087 public int getAdjustedMinimumSequenceLength() { 088 run(); 089 return adjustedMinimumSequenceLength; 090 } 091 092 private void run() { 093 if (modified) { 094 calcAdjustedMinimumSequenceLength(); 095 extractProteinChains(); 096 modified = false; 097 } 098 } 099 100 private void extractProteinChains() { 101 int models = 1; 102 if (structure.isBiologicalAssembly()) { 103 models = structure.nrModels(); 104 } 105 106 if (parameters.isVerbose()) { 107 System.out.println("Protein chains used in calculation:"); 108 System.out.println("Adjusted minimum sequence length: " + adjustedMinimumSequenceLength); 109 } 110 111 for (int i = 0; i < models; i++) { 112 for (Chain c : structure.getChains(i)) { 113 if (isNucleicAcidChain(c)) { 114 nucleicAcidChainCount++; 115 } //TODO Should we break here for DNA? If "CA" atoms are present, could cause bugs. -Spencer 9-2015 116 Atom[] ca = StructureTools.getAtomCAArray(c); 117 ca = retainStandardAminoAcidResidues(ca); 118 119 if (ca.length >= adjustedMinimumSequenceLength) { 120 if (parameters.isVerbose()) { 121 System.out.println("Chain " + c.getChainID() + " Calpha atoms: " + ca.length + " seqres: " + c.getSeqResSequence()); 122 } 123 124 cAlphaTrace.add(ca); 125 chainIds.add(c.getChainID()); 126 modelNumbers.add(i); 127 sequences.add(replaceQuestionMarks(c.getSeqResSequence())); 128 } 129 } 130 } 131 } 132 133 /** 134 * Returns an adapted minimum sequence length. This method ensure that 135 * structure that only have short chains are not excluded by the 136 * minimumSequenceLength cutoff value; 137 * @return 138 */ 139 private void calcAdjustedMinimumSequenceLength() { 140 int models = 1; 141 if (structure.isBiologicalAssembly()) { 142 models = structure.nrModels(); 143 } 144 145 int maxLength = Integer.MIN_VALUE; 146 int minLength = Integer.MAX_VALUE; 147 148 List<Integer> lengths = new ArrayList<Integer>(); 149 for (int i = 0; i < models; i++) { 150 for (Chain c : structure.getChains(i)) { 151 Atom[] ca = StructureTools.getAtomCAArray(c); 152 ca = retainStandardAminoAcidResidues(ca); 153 if (ca.length >= parameters.getAbsoluteMinimumSequenceLength()) { 154 maxLength = Math.max(ca.length, maxLength); 155 minLength = Math.min(ca.length, minLength); 156 lengths.add(ca.length); 157 } 158 } 159 } 160 161 adjustedMinimumSequenceLength = parameters.getMinimumSequenceLength(); 162 163 if (lengths.size() < 2) { 164 return; 165 } 166 167 double median = 0; 168 Collections.sort(lengths); 169 if (lengths.size() %2 == 1) { 170 int middle = (lengths.size()-1) / 2; 171 median = lengths.get(middle); 172 } else { 173 int middle2 = lengths.size()/2; 174 int middle1 = middle2-1; 175 median = 0.5 * (lengths.get(middle1) + lengths.get(middle2)); 176 } 177 178 if (minLength >= median * parameters.getMinimumSequenceLengthFraction()) { 179 adjustedMinimumSequenceLength = Math.min(minLength, parameters.getMinimumSequenceLength()); 180 } 181 } 182 183 private boolean isNucleicAcidChain(Chain chain) { 184 int count = 0; 185 for (Group group: chain.getAtomGroups()) { 186 PolymerType type = group.getChemComp().getPolymerType(); 187 if (type != null && (type.equals(PolymerType.dna) || type.equals(PolymerType.rna) || type.equals(PolymerType.dnarna))) { 188 count++; 189 } 190 } 191 return count > 3; 192 } 193 194 // In some cases "?" are in the sequence string. Example 2WS1. This is caused 195 // because the chemical component file YNM doesn't contain a one-letter code. 196 private String replaceQuestionMarks(String sequence) { 197 return sequence.replaceAll("\\?", "X"); 198 } 199 200 private Atom[] retainStandardAminoAcidResidues(Atom[] atoms) { 201 List<Atom> atomList = new ArrayList<Atom>(atoms.length); 202 for (Atom atom: atoms) { 203 Group group = atom.getGroup(); 204 if (group.getPDBName().equalsIgnoreCase("UNK")) { 205 continue; 206 } 207 if (! isAminoAcid(group)) { 208 continue; 209 } 210 atomList.add(atom); 211 } 212 return atomList.toArray(new Atom[atomList.size()]); 213 } 214 215 private boolean isAminoAcid(Group group) { 216 ChemComp cc= group.getChemComp(); 217 if (cc.getResidueType() == null) { 218 logger.warn("null residue type for: " + group.getPDBName()); 219 return false; 220 } 221 return (cc.getResidueType().equals(ResidueType.lPeptideLinking) || cc.getResidueType().equals(ResidueType.glycine)); 222 } 223}