001/** 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created by Spencer Bliven 021 * 022 */ 023package org.biojava.nbio.structure.io; 024 025import org.biojava.nbio.alignment.Alignments; 026import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; 027import org.biojava.nbio.alignment.SimpleGapPenalty; 028import org.biojava.nbio.core.alignment.matrices.SimpleSubstitutionMatrix; 029import org.biojava.nbio.core.alignment.template.AlignedSequence; 030import org.biojava.nbio.core.alignment.template.SequencePair; 031import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; 032import org.biojava.nbio.structure.*; 033import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 034import org.biojava.nbio.core.sequence.ProteinSequence; 035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 036import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 037import org.biojava.nbio.core.sequence.template.CompoundSet; 038import org.slf4j.Logger; 039import org.slf4j.LoggerFactory; 040 041import java.io.InputStreamReader; 042import java.util.*; 043 044 045/** 046 * A utility class with methods for matching ProteinSequences with 047 * Structures. 048 * @author Spencer Bliven 049 * 050 */ 051public class StructureSequenceMatcher { 052 053 private static final Logger logger = LoggerFactory.getLogger(StructureSequenceMatcher.class); 054 055 /** 056 * Get a substructure of {@code wholeStructure} containing only the {@link Group Groups} that are included in 057 * {@code sequence}. The resulting structure will contain only {@code ATOM} residues; the SEQ-RES will be empty. 058 * The {@link Chain Chains} of the Structure will be new instances (cloned), but the {@link Group Groups} will not. 059 * @param sequence The input protein sequence 060 * @param wholeStructure The structure from which to take a substructure 061 * @return The resulting structure 062 * @throws StructureException 063 * @see {@link #matchSequenceToStructure(ProteinSequence, Structure)} 064 */ 065 public static Structure getSubstructureMatchingProteinSequence(ProteinSequence sequence, Structure wholeStructure) { 066 ResidueNumber[] rns = matchSequenceToStructure(sequence, wholeStructure); 067 Structure structure = wholeStructure.clone(); 068 structure.setChains(new ArrayList<>()); 069// structure.getHetGroups().clear(); 070 Chain currentChain = null; 071 for (ResidueNumber rn : rns) { 072 if (rn == null) continue; 073 Group group; // note that we don't clone 074 try { 075 group = StructureTools.getGroupByPDBResidueNumber(wholeStructure, rn); 076 } catch (StructureException e) { 077 throw new IllegalArgumentException("Could not find residue " + rn + " in structure", e); 078 } 079 Chain chain = new ChainImpl(); 080 chain.setName(group.getChain().getName()); 081 chain.setId(group.getChain().getId()); 082 if (currentChain == null || !currentChain.getId().equals(chain.getId())) { 083 structure.addChain(chain); 084 chain.setEntityInfo(group.getChain().getEntityInfo()); 085 chain.setStructure(structure); 086 chain.setSwissprotId(group.getChain().getSwissprotId()); 087 chain.setId(group.getChain().getId()); 088 chain.setName(group.getChain().getName()); 089 currentChain = chain; 090 } 091 currentChain.addGroup(group); 092 } 093 return structure; 094 } 095 096 /** 097 * Generates a ProteinSequence corresponding to the sequence of struct, 098 * and maintains a mapping from the sequence back to the original groups. 099 * 100 * Chains are appended to one another. 'X' is used for heteroatoms. 101 * 102 * @param struct Input structure 103 * @param groupIndexPosition An empty map, which will be populated with 104 * (residue index in returned ProteinSequence) -> (Group within struct) 105 * @return A ProteinSequence with the full sequence of struct. Chains are 106 * concatenated in the same order as the input structures 107 * 108 * @see {@link SeqRes2AtomAligner#getFullAtomSequence(List, Map)}, which 109 * does the heavy lifting. 110 * 111 */ 112 public static ProteinSequence getProteinSequenceForStructure(Structure struct, Map<Integer,Group> groupIndexPosition ) { 113 114 if( groupIndexPosition != null) { 115 groupIndexPosition.clear(); 116 } 117 118 StringBuilder seqStr = new StringBuilder(); 119 120 for(Chain chain : struct.getChains()) { 121 List<Group> groups = chain.getAtomGroups(); 122 Map<Integer,Integer> chainIndexPosition = new HashMap<Integer, Integer>(); 123 int prevLen = seqStr.length(); 124 125 // get the sequence for this chain 126 String chainSeq = SeqRes2AtomAligner.getFullAtomSequence(groups, chainIndexPosition, false); 127 seqStr.append(chainSeq); 128 129 130 // fix up the position to include previous chains, and map the value back to a Group 131 for(Integer seqIndex : chainIndexPosition.keySet()) { 132 Integer groupIndex = chainIndexPosition.get(seqIndex); 133 groupIndexPosition.put(prevLen + seqIndex, groups.get(groupIndex)); 134 } 135 } 136 137 ProteinSequence s = null; 138 try { 139 s = new ProteinSequence(seqStr.toString()); 140 } catch (CompoundNotFoundException e) { 141 // I believe this can't happen, please correct this if I'm wrong - JD 2014-10-24 142 // we can log an error if it does, it would mean there's a bad bug somewhere 143 logger.error("Could not create protein sequence, unknown compounds in string: {}", e.getMessage()); 144 } 145 146 return s; 147 } 148 149 /** 150 * Given a sequence and the corresponding Structure, get the ResidueNumber 151 * for each residue in the sequence. 152 * 153 * <p>Smith-Waterman alignment is used to match the sequences. Residues 154 * in the sequence but not the structure or mismatched between sequence 155 * and structure will have a null atom, while 156 * residues in the structure but not the sequence are ignored with a warning. 157 * @param seq The protein sequence. Should match the sequence of struct very 158 * closely. 159 * @param struct The corresponding protein structure 160 * @return A list of ResidueNumbers of the same length as seq, containing 161 * either the corresponding residue or null. 162 */ 163 public static ResidueNumber[] matchSequenceToStructure(ProteinSequence seq, Structure struct) { 164 165 //1. Create ProteinSequence for struct while remembering to which group each residue corresponds 166 Map<Integer,Group> atomIndexPosition = new HashMap<Integer, Group>(); 167 168 ProteinSequence structSeq = getProteinSequenceForStructure(struct,atomIndexPosition); 169 170 // TODO This should really be semi-global alignment, though global for either sequence OR structure (we don't know which) 171 //2. Run Smith-Waterman to get the alignment 172 // Identity substitution matrix with +1 for match, -1 for mismatch 173 // TODO 174 SubstitutionMatrix<AminoAcidCompound> matrix = 175 new SimpleSubstitutionMatrix<AminoAcidCompound>( 176 AminoAcidCompoundSet.getAminoAcidCompoundSet(), 177 (short)1, (short)-1 ); 178 matrix = new SimpleSubstitutionMatrix<AminoAcidCompound>( 179 AminoAcidCompoundSet.getAminoAcidCompoundSet(), 180 new InputStreamReader( 181 SimpleSubstitutionMatrix.class.getResourceAsStream("/matrices/blosum100.txt")), 182 "blosum100"); 183 SequencePair<ProteinSequence, AminoAcidCompound> pair = 184 Alignments.getPairwiseAlignment(seq, structSeq, 185 PairwiseSequenceAlignerType.GLOBAL, new SimpleGapPenalty(), matrix); 186 187 //System.out.print(pair.toString()); 188 189 //3. Convert the alignment back to Atoms 190 AlignedSequence<ProteinSequence,AminoAcidCompound> alignedSeq = pair.getQuery(); 191 AlignedSequence<ProteinSequence,AminoAcidCompound> alignedStruct = pair.getTarget(); 192 193 194 assert(alignedSeq.getLength() == alignedStruct.getLength()); 195 196// System.out.println(pair.toString(80)); 197// System.out.format("%d/min{%d,%d}\n", pair.getNumIdenticals(), 198// alignedSeq.getLength()-alignedSeq.getNumGaps(), 199// alignedStruct.getLength()-alignedStruct.getNumGaps()); 200 201 ResidueNumber[] ca = new ResidueNumber[seq.getLength()]; 202 203 for( int pos = alignedSeq.getStart().getPosition(); pos <= alignedSeq.getEnd().getPosition(); pos++ ) { // 1-indexed 204 //skip missing residues from sequence. These probably represent alignment errors 205 if(alignedSeq.isGap(pos)) { 206 int structIndex = alignedStruct.getSequenceIndexAt(pos)-1; 207 assert(structIndex > 0);//should be defined since seq gap 208 209 Group g = atomIndexPosition.get(structIndex); 210 211 logger.warn("Chain {} residue {} in the Structure {} has no corresponding amino acid in the sequence.", 212 g.getChainId(), 213 g.getResidueNumber().toString(), 214 g.getChain().getStructure().getPDBCode() ); 215 continue; 216 } 217 218 if(! alignedStruct.isGap(pos) ) { 219 int seqIndex = alignedSeq.getSequenceIndexAt(pos)-1;//1-indexed 220 int structIndex = alignedStruct.getSequenceIndexAt(pos)-1;//1-indexed 221 Group g = atomIndexPosition.get(structIndex); 222 223 assert(0<=seqIndex && seqIndex < ca.length); 224 225 ca[seqIndex] = g.getResidueNumber(); //remains null for gaps 226 } 227 } 228 return ca; 229 } 230 231 232 /** 233 * Removes all gaps ('-') from a protein sequence 234 * @param gapped 235 * @return 236 */ 237 public static ProteinSequence removeGaps(ProteinSequence gapped) { 238 final String[] gapStrings = {"-","."}; 239 240 StringBuilder seq = new StringBuilder(); 241 242 CompoundSet<AminoAcidCompound> aaSet = gapped.getCompoundSet(); 243 AminoAcidCompound[] gaps = new AminoAcidCompound[gapStrings.length]; 244 245 for(int i=0;i<gapStrings.length;i++) { 246 gaps[i] = aaSet.getCompoundForString(gapStrings[i]); 247 } 248 249 for(int i=1; i<=gapped.getLength();i++) { //1-indexed 250 AminoAcidCompound aa = gapped.getCompoundAt(i); 251 boolean isGap = false; 252 for(AminoAcidCompound gap : gaps) { 253 if( aa.equals(gap)) { 254 isGap = true; 255 break; 256 } 257 } 258 if(!isGap) { 259 seq.append(aa.getShortName()); 260 } 261 } 262 263 ProteinSequence ungapped = null; 264 try { 265 ungapped = new ProteinSequence(seq.toString()); 266 } catch (CompoundNotFoundException e) { 267 // this can't happen, if it does there's a bug somewhere 268 logger.error("Could not create ungapped protein sequence, found unknown compounds: {}. This is most likely a bug.", e.getMessage()); 269 } 270 271 return ungapped; 272 } 273 274 /** 275 * Creates a new list consisting of all columns of gapped where no row 276 * contained a null value. 277 * 278 * Here, "row" refers to the first index and "column" to the second, eg 279 * gapped.get(row).get(column) 280 * @param gapped A rectangular matrix containing null to mark gaps 281 * @return A new List without columns containing nulls 282 */ 283 public static <T> T[][] removeGaps(final T[][] gapped) { 284 if(gapped == null ) return null; 285 if(gapped.length < 1) return Arrays.copyOf(gapped, gapped.length); 286 287 final int nProts = gapped.length; 288 final int protLen = gapped[0].length; // length of gapped proteins 289 290 // Verify that input is rectangular 291 for(int i=0;i<nProts;i++) { 292 if(gapped[i].length != protLen) { 293 throw new IllegalArgumentException(String.format( 294 "Expected a rectangular array, but row 0 has %d elements " + 295 "while row %d has %d.", protLen,i,gapped[i].length)); 296 297 } 298 } 299 300 // determine where gaps exist in any structures 301 boolean[] isGap = new boolean[protLen]; 302 int gaps = 0; 303 for(int j=0;j<protLen;j++) { 304 for(int i=0;i<nProts;i++) { 305 if(gapped[i][j] == null ) { 306 isGap[j] = true; 307 gaps++; 308 break; //go to next position 309 } 310 } 311 } 312 313 // Create ungapped array 314 T[][] ungapped = Arrays.copyOf(gapped,nProts); 315 final int ungappedLen = protLen-gaps; 316 for(int i=0;i<nProts;i++) { 317 ungapped[i] = Arrays.copyOf(gapped[i],ungappedLen); 318 int k = 0; 319 for(int j=0;j<protLen;j++) { 320 if(!isGap[j]) { //skip gaps 321 assert(gapped[i][j] != null); 322 ungapped[i][k] = gapped[i][j]; 323 k++; 324 } 325 } 326 assert(k == ungappedLen); 327 } 328 329 return ungapped; 330 } 331}