001/** 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created by Spencer Bliven 021 * 022 */ 023package org.biojava.nbio.structure.io; 024 025import org.biojava.nbio.alignment.Alignments; 026import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; 027import org.biojava.nbio.alignment.SimpleGapPenalty; 028import org.biojava.nbio.core.alignment.matrices.SimpleSubstitutionMatrix; 029import org.biojava.nbio.core.alignment.template.AlignedSequence; 030import org.biojava.nbio.core.alignment.template.SequencePair; 031import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; 032import org.biojava.nbio.structure.*; 033import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 034import org.biojava.nbio.core.sequence.ProteinSequence; 035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 036import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 037import org.biojava.nbio.core.sequence.template.CompoundSet; 038import org.slf4j.Logger; 039import org.slf4j.LoggerFactory; 040 041import java.io.InputStreamReader; 042import java.util.*; 043 044 045/** 046 * A utility class with methods for matching ProteinSequences with 047 * Structures. 048 * @author Spencer Bliven 049 * 050 */ 051public class StructureSequenceMatcher { 052 053 private static final Logger logger = LoggerFactory.getLogger(StructureSequenceMatcher.class); 054 055 /** 056 * Get a substructure of {@code wholeStructure} containing only the {@link Group Groups} that are included in 057 * {@code sequence}. The resulting structure will contain only {@code ATOM} residues; the SEQ-RES will be empty. 058 * The {@link Chain Chains} of the Structure will be new instances (cloned), but the {@link Group Groups} will not. 059 * @param sequence The input protein sequence 060 * @param wholeStructure The structure from which to take a substructure 061 * @return The resulting structure 062 * @see #matchSequenceToStructure(ProteinSequence, Structure) 063 */ 064 public static Structure getSubstructureMatchingProteinSequence(ProteinSequence sequence, Structure wholeStructure) { 065 ResidueNumber[] rns = matchSequenceToStructure(sequence, wholeStructure); 066 Structure structure = wholeStructure.clone(); 067 structure.setChains(new ArrayList<>()); 068// structure.getHetGroups().clear(); 069 Chain currentChain = null; 070 for (ResidueNumber rn : rns) { 071 if (rn == null) continue; 072 Group group; // note that we don't clone 073 try { 074 group = StructureTools.getGroupByPDBResidueNumber(wholeStructure, rn); 075 } catch (StructureException e) { 076 throw new IllegalArgumentException("Could not find residue " + rn + " in structure", e); 077 } 078 Chain chain = new ChainImpl(); 079 chain.setName(group.getChain().getName()); 080 chain.setId(group.getChain().getId()); 081 if (currentChain == null || !currentChain.getId().equals(chain.getId())) { 082 structure.addChain(chain); 083 chain.setEntityInfo(group.getChain().getEntityInfo()); 084 chain.setStructure(structure); 085 chain.setId(group.getChain().getId()); 086 chain.setName(group.getChain().getName()); 087 currentChain = chain; 088 } 089 currentChain.addGroup(group); 090 } 091 return structure; 092 } 093 094 /** 095 * Generates a ProteinSequence corresponding to the sequence of struct, 096 * and maintains a mapping from the sequence back to the original groups. 097 * 098 * Chains are appended to one another. 'X' is used for heteroatoms. 099 * 100 * @param struct Input structure 101 * @param groupIndexPosition An empty map, which will be populated with 102 * (residue index in returned ProteinSequence) -> (Group within struct) 103 * @return A ProteinSequence with the full sequence of struct. Chains are 104 * concatenated in the same order as the input structures 105 * 106 * @see SeqRes2AtomAligner#getFullAtomSequence(List, Map, boolean) 107 * 108 */ 109 public static ProteinSequence getProteinSequenceForStructure(Structure struct, Map<Integer,Group> groupIndexPosition ) { 110 111 if( groupIndexPosition != null) { 112 groupIndexPosition.clear(); 113 } 114 115 StringBuilder seqStr = new StringBuilder(); 116 117 for(Chain chain : struct.getChains()) { 118 List<Group> groups = chain.getAtomGroups(); 119 Map<Integer,Integer> chainIndexPosition = new HashMap<>(); 120 int prevLen = seqStr.length(); 121 122 // get the sequence for this chain 123 String chainSeq = SeqRes2AtomAligner.getFullAtomSequence(groups, chainIndexPosition, false); 124 seqStr.append(chainSeq); 125 126 127 // fix up the position to include previous chains, and map the value back to a Group 128 for(Integer seqIndex : chainIndexPosition.keySet()) { 129 Integer groupIndex = chainIndexPosition.get(seqIndex); 130 groupIndexPosition.put(prevLen + seqIndex, groups.get(groupIndex)); 131 } 132 } 133 134 ProteinSequence s = null; 135 try { 136 s = new ProteinSequence(seqStr.toString()); 137 } catch (CompoundNotFoundException e) { 138 // I believe this can't happen, please correct this if I'm wrong - JD 2014-10-24 139 // we can log an error if it does, it would mean there's a bad bug somewhere 140 logger.error("Could not create protein sequence, unknown compounds in string: {}", e.getMessage()); 141 } 142 143 return s; 144 } 145 146 /** 147 * Given a sequence and the corresponding Structure, get the ResidueNumber 148 * for each residue in the sequence. 149 * 150 * <p>Smith-Waterman alignment is used to match the sequences. Residues 151 * in the sequence but not the structure or mismatched between sequence 152 * and structure will have a null atom, while 153 * residues in the structure but not the sequence are ignored with a warning. 154 * @param seq The protein sequence. Should match the sequence of struct very 155 * closely. 156 * @param struct The corresponding protein structure 157 * @return A list of ResidueNumbers of the same length as seq, containing 158 * either the corresponding residue or null. 159 */ 160 public static ResidueNumber[] matchSequenceToStructure(ProteinSequence seq, Structure struct) { 161 162 //1. Create ProteinSequence for struct while remembering to which group each residue corresponds 163 Map<Integer,Group> atomIndexPosition = new HashMap<>(); 164 165 ProteinSequence structSeq = getProteinSequenceForStructure(struct,atomIndexPosition); 166 167 // TODO This should really be semi-global alignment, though global for either sequence OR structure (we don't know which) 168 //2. Run Smith-Waterman to get the alignment 169 // Identity substitution matrix with +1 for match, -1 for mismatch 170 // TODO 171 SubstitutionMatrix<AminoAcidCompound> matrix = 172 new SimpleSubstitutionMatrix<>( 173 AminoAcidCompoundSet.getAminoAcidCompoundSet(), 174 (short)1, (short)-1 ); 175 matrix = new SimpleSubstitutionMatrix<>( 176 AminoAcidCompoundSet.getAminoAcidCompoundSet(), 177 new InputStreamReader( 178 SimpleSubstitutionMatrix.class.getResourceAsStream("/matrices/blosum100.txt")), 179 "blosum100"); 180 SequencePair<ProteinSequence, AminoAcidCompound> pair = 181 Alignments.getPairwiseAlignment(seq, structSeq, 182 PairwiseSequenceAlignerType.GLOBAL, new SimpleGapPenalty(), matrix); 183 184 //System.out.print(pair.toString()); 185 186 //3. Convert the alignment back to Atoms 187 AlignedSequence<ProteinSequence,AminoAcidCompound> alignedSeq = pair.getQuery(); 188 AlignedSequence<ProteinSequence,AminoAcidCompound> alignedStruct = pair.getTarget(); 189 190 191 assert(alignedSeq.getLength() == alignedStruct.getLength()); 192 193// System.out.println(pair.toString(80)); 194// System.out.format("%d/min{%d,%d}\n", pair.getNumIdenticals(), 195// alignedSeq.getLength()-alignedSeq.getNumGaps(), 196// alignedStruct.getLength()-alignedStruct.getNumGaps()); 197 198 ResidueNumber[] ca = new ResidueNumber[seq.getLength()]; 199 200 for( int pos = alignedSeq.getStart().getPosition(); pos <= alignedSeq.getEnd().getPosition(); pos++ ) { // 1-indexed 201 //skip missing residues from sequence. These probably represent alignment errors 202 if(alignedSeq.isGap(pos)) { 203 int structIndex = alignedStruct.getSequenceIndexAt(pos)-1; 204 assert(structIndex > 0);//should be defined since seq gap 205 206 Group g = atomIndexPosition.get(structIndex); 207 208 logger.warn("Chain {} residue {} in the Structure {} has no corresponding amino acid in the sequence.", 209 g.getChainId(), 210 g.getResidueNumber().toString(), 211 g.getChain().getStructure().getPDBCode() ); 212 continue; 213 } 214 215 if(! alignedStruct.isGap(pos) ) { 216 int seqIndex = alignedSeq.getSequenceIndexAt(pos)-1;//1-indexed 217 int structIndex = alignedStruct.getSequenceIndexAt(pos)-1;//1-indexed 218 Group g = atomIndexPosition.get(structIndex); 219 220 assert(0<=seqIndex && seqIndex < ca.length); 221 222 ca[seqIndex] = g.getResidueNumber(); //remains null for gaps 223 } 224 } 225 return ca; 226 } 227 228 229 /** 230 * Removes all gaps ('-') from a protein sequence 231 * @param gapped 232 * @return 233 */ 234 public static ProteinSequence removeGaps(ProteinSequence gapped) { 235 final String[] gapStrings = {"-","."}; 236 237 StringBuilder seq = new StringBuilder(); 238 239 CompoundSet<AminoAcidCompound> aaSet = gapped.getCompoundSet(); 240 AminoAcidCompound[] gaps = new AminoAcidCompound[gapStrings.length]; 241 242 for(int i=0;i<gapStrings.length;i++) { 243 gaps[i] = aaSet.getCompoundForString(gapStrings[i]); 244 } 245 246 for(int i=1; i<=gapped.getLength();i++) { //1-indexed 247 AminoAcidCompound aa = gapped.getCompoundAt(i); 248 boolean isGap = false; 249 for(AminoAcidCompound gap : gaps) { 250 if( aa.equals(gap)) { 251 isGap = true; 252 break; 253 } 254 } 255 if(!isGap) { 256 seq.append(aa.getShortName()); 257 } 258 } 259 260 ProteinSequence ungapped = null; 261 try { 262 ungapped = new ProteinSequence(seq.toString()); 263 } catch (CompoundNotFoundException e) { 264 // this can't happen, if it does there's a bug somewhere 265 logger.error("Could not create ungapped protein sequence, found unknown compounds: {}. This is most likely a bug.", e.getMessage()); 266 } 267 268 return ungapped; 269 } 270 271 /** 272 * Creates a new list consisting of all columns of gapped where no row 273 * contained a null value. 274 * 275 * Here, "row" refers to the first index and "column" to the second, eg 276 * gapped.get(row).get(column) 277 * @param gapped A rectangular matrix containing null to mark gaps 278 * @return A new List without columns containing nulls 279 */ 280 public static <T> T[][] removeGaps(final T[][] gapped) { 281 if(gapped == null ) return null; 282 if(gapped.length < 1) return Arrays.copyOf(gapped, gapped.length); 283 284 final int nProts = gapped.length; 285 final int protLen = gapped[0].length; // length of gapped proteins 286 287 // Verify that input is rectangular 288 for(int i=0;i<nProts;i++) { 289 if(gapped[i].length != protLen) { 290 throw new IllegalArgumentException(String.format( 291 "Expected a rectangular array, but row 0 has %d elements " + 292 "while row %d has %d.", protLen,i,gapped[i].length)); 293 294 } 295 } 296 297 // determine where gaps exist in any structures 298 boolean[] isGap = new boolean[protLen]; 299 int gaps = 0; 300 for(int j=0;j<protLen;j++) { 301 for(int i=0;i<nProts;i++) { 302 if(gapped[i][j] == null ) { 303 isGap[j] = true; 304 gaps++; 305 break; //go to next position 306 } 307 } 308 } 309 310 // Create ungapped array 311 T[][] ungapped = Arrays.copyOf(gapped,nProts); 312 final int ungappedLen = protLen-gaps; 313 for(int i=0;i<nProts;i++) { 314 ungapped[i] = Arrays.copyOf(gapped[i],ungappedLen); 315 int k = 0; 316 for(int j=0;j<protLen;j++) { 317 if(!isGap[j]) { //skip gaps 318 assert(gapped[i][j] != null); 319 ungapped[i][k] = gapped[i][j]; 320 k++; 321 } 322 } 323 assert(k == ungappedLen); 324 } 325 326 return ungapped; 327 } 328}