001/** 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created by Spencer Bliven 021 * 022 */ 023package org.biojava.nbio.structure.io; 024 025import org.biojava.nbio.alignment.Alignments; 026import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; 027import org.biojava.nbio.alignment.SimpleGapPenalty; 028import org.biojava.nbio.core.alignment.matrices.SimpleSubstitutionMatrix; 029import org.biojava.nbio.core.alignment.template.AlignedSequence; 030import org.biojava.nbio.core.alignment.template.SequencePair; 031import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; 032import org.biojava.nbio.structure.*; 033import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 034import org.biojava.nbio.core.sequence.ProteinSequence; 035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 036import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 037import org.biojava.nbio.core.sequence.template.CompoundSet; 038import org.slf4j.Logger; 039import org.slf4j.LoggerFactory; 040 041import java.io.InputStreamReader; 042import java.util.*; 043 044 045/** 046 * A utility class with methods for matching ProteinSequences with 047 * Structures. 048 * @author Spencer Bliven 049 * 050 */ 051public class StructureSequenceMatcher { 052 053 private static final Logger logger = LoggerFactory.getLogger(StructureSequenceMatcher.class); 054 055 /** 056 * Get a substructure of {@code wholeStructure} containing only the {@link Group Groups} that are included in 057 * {@code sequence}. The resulting structure will contain only {@code ATOM} residues; the SEQ-RES will be empty. 058 * The {@link Chain Chains} of the Structure will be new instances (cloned), but the {@link Group Groups} will not. 059 * @param sequence The input protein sequence 060 * @param wholeStructure The structure from which to take a substructure 061 * @return The resulting structure 062 * @throws StructureException 063 * @see {@link #matchSequenceToStructure(ProteinSequence, Structure)} 064 */ 065 public static Structure getSubstructureMatchingProteinSequence(ProteinSequence sequence, Structure wholeStructure) { 066 ResidueNumber[] rns = matchSequenceToStructure(sequence, wholeStructure); 067 Structure structure = wholeStructure.clone(); 068 structure.setChains(new ArrayList<>()); 069// structure.getHetGroups().clear(); 070 Chain currentChain = null; 071 for (ResidueNumber rn : rns) { 072 if (rn == null) continue; 073 Group group; // note that we don't clone 074 try { 075 group = StructureTools.getGroupByPDBResidueNumber(wholeStructure, rn); 076 } catch (StructureException e) { 077 throw new IllegalArgumentException("Could not find residue " + rn + " in structure", e); 078 } 079 Chain chain = new ChainImpl(); 080 chain.setName(group.getChain().getName()); 081 chain.setId(group.getChain().getId()); 082 if (currentChain == null || !currentChain.getId().equals(chain.getId())) { 083 structure.addChain(chain); 084 chain.setEntityInfo(group.getChain().getEntityInfo()); 085 chain.setStructure(structure); 086 chain.setId(group.getChain().getId()); 087 chain.setName(group.getChain().getName()); 088 currentChain = chain; 089 } 090 currentChain.addGroup(group); 091 } 092 return structure; 093 } 094 095 /** 096 * Generates a ProteinSequence corresponding to the sequence of struct, 097 * and maintains a mapping from the sequence back to the original groups. 098 * 099 * Chains are appended to one another. 'X' is used for heteroatoms. 100 * 101 * @param struct Input structure 102 * @param groupIndexPosition An empty map, which will be populated with 103 * (residue index in returned ProteinSequence) -> (Group within struct) 104 * @return A ProteinSequence with the full sequence of struct. Chains are 105 * concatenated in the same order as the input structures 106 * 107 * @see {@link SeqRes2AtomAligner#getFullAtomSequence(List, Map)}, which 108 * does the heavy lifting. 109 * 110 */ 111 public static ProteinSequence getProteinSequenceForStructure(Structure struct, Map<Integer,Group> groupIndexPosition ) { 112 113 if( groupIndexPosition != null) { 114 groupIndexPosition.clear(); 115 } 116 117 StringBuilder seqStr = new StringBuilder(); 118 119 for(Chain chain : struct.getChains()) { 120 List<Group> groups = chain.getAtomGroups(); 121 Map<Integer,Integer> chainIndexPosition = new HashMap<Integer, Integer>(); 122 int prevLen = seqStr.length(); 123 124 // get the sequence for this chain 125 String chainSeq = SeqRes2AtomAligner.getFullAtomSequence(groups, chainIndexPosition, false); 126 seqStr.append(chainSeq); 127 128 129 // fix up the position to include previous chains, and map the value back to a Group 130 for(Integer seqIndex : chainIndexPosition.keySet()) { 131 Integer groupIndex = chainIndexPosition.get(seqIndex); 132 groupIndexPosition.put(prevLen + seqIndex, groups.get(groupIndex)); 133 } 134 } 135 136 ProteinSequence s = null; 137 try { 138 s = new ProteinSequence(seqStr.toString()); 139 } catch (CompoundNotFoundException e) { 140 // I believe this can't happen, please correct this if I'm wrong - JD 2014-10-24 141 // we can log an error if it does, it would mean there's a bad bug somewhere 142 logger.error("Could not create protein sequence, unknown compounds in string: {}", e.getMessage()); 143 } 144 145 return s; 146 } 147 148 /** 149 * Given a sequence and the corresponding Structure, get the ResidueNumber 150 * for each residue in the sequence. 151 * 152 * <p>Smith-Waterman alignment is used to match the sequences. Residues 153 * in the sequence but not the structure or mismatched between sequence 154 * and structure will have a null atom, while 155 * residues in the structure but not the sequence are ignored with a warning. 156 * @param seq The protein sequence. Should match the sequence of struct very 157 * closely. 158 * @param struct The corresponding protein structure 159 * @return A list of ResidueNumbers of the same length as seq, containing 160 * either the corresponding residue or null. 161 */ 162 public static ResidueNumber[] matchSequenceToStructure(ProteinSequence seq, Structure struct) { 163 164 //1. Create ProteinSequence for struct while remembering to which group each residue corresponds 165 Map<Integer,Group> atomIndexPosition = new HashMap<Integer, Group>(); 166 167 ProteinSequence structSeq = getProteinSequenceForStructure(struct,atomIndexPosition); 168 169 // TODO This should really be semi-global alignment, though global for either sequence OR structure (we don't know which) 170 //2. Run Smith-Waterman to get the alignment 171 // Identity substitution matrix with +1 for match, -1 for mismatch 172 // TODO 173 SubstitutionMatrix<AminoAcidCompound> matrix = 174 new SimpleSubstitutionMatrix<AminoAcidCompound>( 175 AminoAcidCompoundSet.getAminoAcidCompoundSet(), 176 (short)1, (short)-1 ); 177 matrix = new SimpleSubstitutionMatrix<AminoAcidCompound>( 178 AminoAcidCompoundSet.getAminoAcidCompoundSet(), 179 new InputStreamReader( 180 SimpleSubstitutionMatrix.class.getResourceAsStream("/matrices/blosum100.txt")), 181 "blosum100"); 182 SequencePair<ProteinSequence, AminoAcidCompound> pair = 183 Alignments.getPairwiseAlignment(seq, structSeq, 184 PairwiseSequenceAlignerType.GLOBAL, new SimpleGapPenalty(), matrix); 185 186 //System.out.print(pair.toString()); 187 188 //3. Convert the alignment back to Atoms 189 AlignedSequence<ProteinSequence,AminoAcidCompound> alignedSeq = pair.getQuery(); 190 AlignedSequence<ProteinSequence,AminoAcidCompound> alignedStruct = pair.getTarget(); 191 192 193 assert(alignedSeq.getLength() == alignedStruct.getLength()); 194 195// System.out.println(pair.toString(80)); 196// System.out.format("%d/min{%d,%d}\n", pair.getNumIdenticals(), 197// alignedSeq.getLength()-alignedSeq.getNumGaps(), 198// alignedStruct.getLength()-alignedStruct.getNumGaps()); 199 200 ResidueNumber[] ca = new ResidueNumber[seq.getLength()]; 201 202 for( int pos = alignedSeq.getStart().getPosition(); pos <= alignedSeq.getEnd().getPosition(); pos++ ) { // 1-indexed 203 //skip missing residues from sequence. These probably represent alignment errors 204 if(alignedSeq.isGap(pos)) { 205 int structIndex = alignedStruct.getSequenceIndexAt(pos)-1; 206 assert(structIndex > 0);//should be defined since seq gap 207 208 Group g = atomIndexPosition.get(structIndex); 209 210 logger.warn("Chain {} residue {} in the Structure {} has no corresponding amino acid in the sequence.", 211 g.getChainId(), 212 g.getResidueNumber().toString(), 213 g.getChain().getStructure().getPDBCode() ); 214 continue; 215 } 216 217 if(! alignedStruct.isGap(pos) ) { 218 int seqIndex = alignedSeq.getSequenceIndexAt(pos)-1;//1-indexed 219 int structIndex = alignedStruct.getSequenceIndexAt(pos)-1;//1-indexed 220 Group g = atomIndexPosition.get(structIndex); 221 222 assert(0<=seqIndex && seqIndex < ca.length); 223 224 ca[seqIndex] = g.getResidueNumber(); //remains null for gaps 225 } 226 } 227 return ca; 228 } 229 230 231 /** 232 * Removes all gaps ('-') from a protein sequence 233 * @param gapped 234 * @return 235 */ 236 public static ProteinSequence removeGaps(ProteinSequence gapped) { 237 final String[] gapStrings = {"-","."}; 238 239 StringBuilder seq = new StringBuilder(); 240 241 CompoundSet<AminoAcidCompound> aaSet = gapped.getCompoundSet(); 242 AminoAcidCompound[] gaps = new AminoAcidCompound[gapStrings.length]; 243 244 for(int i=0;i<gapStrings.length;i++) { 245 gaps[i] = aaSet.getCompoundForString(gapStrings[i]); 246 } 247 248 for(int i=1; i<=gapped.getLength();i++) { //1-indexed 249 AminoAcidCompound aa = gapped.getCompoundAt(i); 250 boolean isGap = false; 251 for(AminoAcidCompound gap : gaps) { 252 if( aa.equals(gap)) { 253 isGap = true; 254 break; 255 } 256 } 257 if(!isGap) { 258 seq.append(aa.getShortName()); 259 } 260 } 261 262 ProteinSequence ungapped = null; 263 try { 264 ungapped = new ProteinSequence(seq.toString()); 265 } catch (CompoundNotFoundException e) { 266 // this can't happen, if it does there's a bug somewhere 267 logger.error("Could not create ungapped protein sequence, found unknown compounds: {}. This is most likely a bug.", e.getMessage()); 268 } 269 270 return ungapped; 271 } 272 273 /** 274 * Creates a new list consisting of all columns of gapped where no row 275 * contained a null value. 276 * 277 * Here, "row" refers to the first index and "column" to the second, eg 278 * gapped.get(row).get(column) 279 * @param gapped A rectangular matrix containing null to mark gaps 280 * @return A new List without columns containing nulls 281 */ 282 public static <T> T[][] removeGaps(final T[][] gapped) { 283 if(gapped == null ) return null; 284 if(gapped.length < 1) return Arrays.copyOf(gapped, gapped.length); 285 286 final int nProts = gapped.length; 287 final int protLen = gapped[0].length; // length of gapped proteins 288 289 // Verify that input is rectangular 290 for(int i=0;i<nProts;i++) { 291 if(gapped[i].length != protLen) { 292 throw new IllegalArgumentException(String.format( 293 "Expected a rectangular array, but row 0 has %d elements " + 294 "while row %d has %d.", protLen,i,gapped[i].length)); 295 296 } 297 } 298 299 // determine where gaps exist in any structures 300 boolean[] isGap = new boolean[protLen]; 301 int gaps = 0; 302 for(int j=0;j<protLen;j++) { 303 for(int i=0;i<nProts;i++) { 304 if(gapped[i][j] == null ) { 305 isGap[j] = true; 306 gaps++; 307 break; //go to next position 308 } 309 } 310 } 311 312 // Create ungapped array 313 T[][] ungapped = Arrays.copyOf(gapped,nProts); 314 final int ungappedLen = protLen-gaps; 315 for(int i=0;i<nProts;i++) { 316 ungapped[i] = Arrays.copyOf(gapped[i],ungappedLen); 317 int k = 0; 318 for(int j=0;j<protLen;j++) { 319 if(!isGap[j]) { //skip gaps 320 assert(gapped[i][j] != null); 321 ungapped[i][k] = gapped[i][j]; 322 k++; 323 } 324 } 325 assert(k == ungappedLen); 326 } 327 328 return ungapped; 329 } 330}