001/** 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created by Spencer Bliven 021 * 022 */ 023package org.biojava.nbio.structure.io; 024 025import org.biojava.nbio.alignment.Alignments; 026import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; 027import org.biojava.nbio.alignment.SimpleGapPenalty; 028import org.biojava.nbio.core.alignment.matrices.SimpleSubstitutionMatrix; 029import org.biojava.nbio.core.alignment.template.AlignedSequence; 030import org.biojava.nbio.core.alignment.template.SequencePair; 031import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; 032import org.biojava.nbio.structure.*; 033import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 034import org.biojava.nbio.core.sequence.ProteinSequence; 035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 036import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 037import org.biojava.nbio.core.sequence.template.CompoundSet; 038import org.slf4j.Logger; 039import org.slf4j.LoggerFactory; 040 041import java.io.InputStreamReader; 042import java.util.Arrays; 043import java.util.HashMap; 044import java.util.List; 045import java.util.Map; 046 047 048/** 049 * A utility class with methods for matching ProteinSequences with 050 * Structures. 051 * @author Spencer Bliven 052 * 053 */ 054public class StructureSequenceMatcher { 055 056 private static final Logger logger = LoggerFactory.getLogger(StructureSequenceMatcher.class); 057 058 /** 059 * Get a substructure of {@code wholeStructure} containing only the {@link Group Groups} that are included in 060 * {@code sequence}. The resulting structure will contain only {@code ATOM} residues; the SEQ-RES will be empty. 061 * The {@link Chain Chains} of the Structure will be new instances (cloned), but the {@link Group Groups} will not. 062 * @param sequence The input protein sequence 063 * @param wholeStructure The structure from which to take a substructure 064 * @return The resulting structure 065 * @throws StructureException 066 * @see {@link #matchSequenceToStructure(ProteinSequence, Structure)} 067 */ 068 public static Structure getSubstructureMatchingProteinSequence(ProteinSequence sequence, Structure wholeStructure) { 069 ResidueNumber[] rns = matchSequenceToStructure(sequence, wholeStructure); 070 Structure structure = wholeStructure.clone(); 071 structure.getChains().clear(); 072// structure.getHetGroups().clear(); 073 Chain currentChain = null; 074 for (ResidueNumber rn : rns) { 075 if (rn == null) continue; 076 Group group; // note that we don't clone 077 try { 078 group = StructureTools.getGroupByPDBResidueNumber(wholeStructure, rn); 079 } catch (StructureException e) { 080 throw new IllegalArgumentException("Could not find residue " + rn + " in structure", e); 081 } 082 Chain chain = new ChainImpl(); 083 chain.setChainID(group.getChainId()); 084 if (currentChain == null || !currentChain.getChainID().equals(chain.getChainID())) { 085 structure.addChain(chain); 086 chain.setCompound(group.getChain().getCompound()); 087 chain.setStructure(structure); 088 chain.setSwissprotId(group.getChain().getSwissprotId()); 089 chain.setInternalChainID(group.getChain().getInternalChainID()); 090 chain.setId(group.getChain().getId()); 091 currentChain = chain; 092 } 093 currentChain.addGroup(group); 094 } 095 return structure; 096 } 097 098 /** 099 * Generates a ProteinSequence corresponding to the sequence of struct, 100 * and maintains a mapping from the sequence back to the original groups. 101 * 102 * Chains are appended to one another. 'X' is used for heteroatoms. 103 * 104 * @param struct Input structure 105 * @param groupIndexPosition An empty map, which will be populated with 106 * (residue index in returned ProteinSequence) -> (Group within struct) 107 * @return A ProteinSequence with the full sequence of struct. Chains are 108 * concatenated in the same order as the input structures 109 * 110 * @see {@link SeqRes2AtomAligner#getFullAtomSequence(List, Map)}, which 111 * does the heavy lifting. 112 * 113 */ 114 public static ProteinSequence getProteinSequenceForStructure(Structure struct, Map<Integer,Group> groupIndexPosition ) { 115 116 if( groupIndexPosition != null) { 117 groupIndexPosition.clear(); 118 } 119 120 StringBuilder seqStr = new StringBuilder(); 121 122 for(Chain chain : struct.getChains()) { 123 List<Group> groups = chain.getAtomGroups(); 124 Map<Integer,Integer> chainIndexPosition = new HashMap<Integer, Integer>(); 125 int prevLen = seqStr.length(); 126 127 // get the sequence for this chain 128 String chainSeq = SeqRes2AtomAligner.getFullAtomSequence(groups, chainIndexPosition, false); 129 seqStr.append(chainSeq); 130 131 132 // fix up the position to include previous chains, and map the value back to a Group 133 for(Integer seqIndex : chainIndexPosition.keySet()) { 134 Integer groupIndex = chainIndexPosition.get(seqIndex); 135 groupIndexPosition.put(prevLen + seqIndex, groups.get(groupIndex)); 136 } 137 } 138 139 ProteinSequence s = null; 140 try { 141 s = new ProteinSequence(seqStr.toString()); 142 } catch (CompoundNotFoundException e) { 143 // I believe this can't happen, please correct this if I'm wrong - JD 2014-10-24 144 // we can log an error if it does, it would mean there's a bad bug somewhere 145 logger.error("Could not create protein sequence, unknown compounds in string: {}", e.getMessage()); 146 } 147 148 return s; 149 } 150 151 /** 152 * Given a sequence and the corresponding Structure, get the ResidueNumber 153 * for each residue in the sequence. 154 * 155 * <p>Smith-Waterman alignment is used to match the sequences. Residues 156 * in the sequence but not the structure or mismatched between sequence 157 * and structure will have a null atom, while 158 * residues in the structure but not the sequence are ignored with a warning. 159 * @param seq The protein sequence. Should match the sequence of struct very 160 * closely. 161 * @param struct The corresponding protein structure 162 * @return A list of ResidueNumbers of the same length as seq, containing 163 * either the corresponding residue or null. 164 */ 165 public static ResidueNumber[] matchSequenceToStructure(ProteinSequence seq, Structure struct) { 166 167 //1. Create ProteinSequence for struct while remembering to which group each residue corresponds 168 Map<Integer,Group> atomIndexPosition = new HashMap<Integer, Group>(); 169 170 ProteinSequence structSeq = getProteinSequenceForStructure(struct,atomIndexPosition); 171 172 // TODO This should really be semi-global alignment, though global for either sequence OR structure (we don't know which) 173 //2. Run Smith-Waterman to get the alignment 174 // Identity substitution matrix with +1 for match, -1 for mismatch 175 // TODO 176 SubstitutionMatrix<AminoAcidCompound> matrix = 177 new SimpleSubstitutionMatrix<AminoAcidCompound>( 178 AminoAcidCompoundSet.getAminoAcidCompoundSet(), 179 (short)1, (short)-1 ); 180 matrix = new SimpleSubstitutionMatrix<AminoAcidCompound>( 181 AminoAcidCompoundSet.getAminoAcidCompoundSet(), 182 new InputStreamReader( 183 SimpleSubstitutionMatrix.class.getResourceAsStream("/matrices/blosum100.txt")), 184 "blosum100"); 185 SequencePair<ProteinSequence, AminoAcidCompound> pair = 186 Alignments.getPairwiseAlignment(seq, structSeq, 187 PairwiseSequenceAlignerType.GLOBAL, new SimpleGapPenalty(), matrix); 188 189 //System.out.print(pair.toString()); 190 191 //3. Convert the alignment back to Atoms 192 AlignedSequence<ProteinSequence,AminoAcidCompound> alignedSeq = pair.getQuery(); 193 AlignedSequence<ProteinSequence,AminoAcidCompound> alignedStruct = pair.getTarget(); 194 195 196 assert(alignedSeq.getLength() == alignedStruct.getLength()); 197 198// System.out.println(pair.toString(80)); 199// System.out.format("%d/min{%d,%d}\n", pair.getNumIdenticals(), 200// alignedSeq.getLength()-alignedSeq.getNumGaps(), 201// alignedStruct.getLength()-alignedStruct.getNumGaps()); 202 203 ResidueNumber[] ca = new ResidueNumber[seq.getLength()]; 204 205 for( int pos = alignedSeq.getStart().getPosition(); pos <= alignedSeq.getEnd().getPosition(); pos++ ) { // 1-indexed 206 //skip missing residues from sequence. These probably represent alignment errors 207 if(alignedSeq.isGap(pos)) { 208 int structIndex = alignedStruct.getSequenceIndexAt(pos)-1; 209 assert(structIndex > 0);//should be defined since seq gap 210 211 Group g = atomIndexPosition.get(structIndex); 212 213 logger.warn("Chain {} residue {} in the Structure {} has no corresponding amino acid in the sequence.", 214 g.getChainId(), 215 g.getResidueNumber().toString(), 216 g.getChain().getStructure().getPDBCode() ); 217 continue; 218 } 219 220 if(! alignedStruct.isGap(pos) ) { 221 int seqIndex = alignedSeq.getSequenceIndexAt(pos)-1;//1-indexed 222 int structIndex = alignedStruct.getSequenceIndexAt(pos)-1;//1-indexed 223 Group g = atomIndexPosition.get(structIndex); 224 225 assert(0<=seqIndex && seqIndex < ca.length); 226 227 ca[seqIndex] = g.getResidueNumber(); //remains null for gaps 228 } 229 } 230 return ca; 231 } 232 233 234 /** 235 * Removes all gaps ('-') from a protein sequence 236 * @param gapped 237 * @return 238 */ 239 public static ProteinSequence removeGaps(ProteinSequence gapped) { 240 final String[] gapStrings = {"-","."}; 241 242 StringBuilder seq = new StringBuilder(); 243 244 CompoundSet<AminoAcidCompound> aaSet = gapped.getCompoundSet(); 245 AminoAcidCompound[] gaps = new AminoAcidCompound[gapStrings.length]; 246 247 for(int i=0;i<gapStrings.length;i++) { 248 gaps[i] = aaSet.getCompoundForString(gapStrings[i]); 249 } 250 251 for(int i=1; i<=gapped.getLength();i++) { //1-indexed 252 AminoAcidCompound aa = gapped.getCompoundAt(i); 253 boolean isGap = false; 254 for(AminoAcidCompound gap : gaps) { 255 if( aa.equals(gap)) { 256 isGap = true; 257 break; 258 } 259 } 260 if(!isGap) { 261 seq.append(aa.getShortName()); 262 } 263 } 264 265 ProteinSequence ungapped = null; 266 try { 267 ungapped = new ProteinSequence(seq.toString()); 268 } catch (CompoundNotFoundException e) { 269 // this can't happen, if it does there's a bug somewhere 270 logger.error("Could not create ungapped protein sequence, found unknown compounds: {}. This is most likely a bug.", e.getMessage()); 271 } 272 273 return ungapped; 274 } 275 276 /** 277 * Creates a new list consisting of all columns of gapped where no row 278 * contained a null value. 279 * 280 * Here, "row" refers to the first index and "column" to the second, eg 281 * gapped.get(row).get(column) 282 * @param gapped A rectangular matrix containing null to mark gaps 283 * @return A new List without columns containing nulls 284 */ 285 public static <T> T[][] removeGaps(final T[][] gapped) { 286 if(gapped == null ) return null; 287 if(gapped.length < 1) return Arrays.copyOf(gapped, gapped.length); 288 289 final int nProts = gapped.length; 290 final int protLen = gapped[0].length; // length of gapped proteins 291 292 // Verify that input is rectangular 293 for(int i=0;i<nProts;i++) { 294 if(gapped[i].length != protLen) { 295 throw new IllegalArgumentException(String.format( 296 "Expected a rectangular array, but row 0 has %d elements " + 297 "while row %d has %d.", protLen,i,gapped[i].length)); 298 299 } 300 } 301 302 // determine where gaps exist in any structures 303 boolean[] isGap = new boolean[protLen]; 304 int gaps = 0; 305 for(int j=0;j<protLen;j++) { 306 for(int i=0;i<nProts;i++) { 307 if(gapped[i][j] == null ) { 308 isGap[j] = true; 309 gaps++; 310 break; //go to next position 311 } 312 } 313 } 314 315 // Create ungapped array 316 T[][] ungapped = Arrays.copyOf(gapped,nProts); 317 final int ungappedLen = protLen-gaps; 318 for(int i=0;i<nProts;i++) { 319 ungapped[i] = Arrays.copyOf(gapped[i],ungappedLen); 320 int k = 0; 321 for(int j=0;j<protLen;j++) { 322 if(!isGap[j]) { //skip gaps 323 assert(gapped[i][j] != null); 324 ungapped[i][k] = gapped[i][j]; 325 k++; 326 } 327 } 328 assert(k == ungappedLen); 329 } 330 331 return ungapped; 332 } 333}