001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.core.sequence.io; 022 023import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 024import org.biojava.nbio.core.sequence.ProteinSequence; 025import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 026import org.biojava.nbio.core.sequence.template.AbstractSequence; 027import org.biojava.nbio.core.sequence.template.CompoundSet; 028import org.biojava.nbio.core.sequence.template.ProxySequenceReader; 029import org.slf4j.Logger; 030import org.slf4j.LoggerFactory; 031 032import java.util.ArrayList; 033import java.util.Collection; 034import java.util.List; 035import java.util.Locale; 036 037/** 038 * A sequence creator which preserves the case of its input string in 039 * the user collection of the returned ProteinSequence. 040 * 041 * <p>The user collection will be the same length as the resulting ProteinSequence. 042 * Each object can be cast to a Boolean. If true, the corresponding position in 043 * the input file was uppercase. 044 * 045 * <h3>Example</h3> 046 * <code><pre>CasePreservingProteinSequenceCreator creator = 047 * new CasePreservingProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()); 048 *AbstractSequence<AminoAcidCompound> seq = creator.getSequence("aaAA",0); 049 *System.out.println(seq.getSequenceAsString()); //"AAAA" 050 *System.out.println(seq.getUserCollection()); //"[false, false, true, true]" 051 *</code></pre> 052 */ 053public class CasePreservingProteinSequenceCreator extends ProteinSequenceCreator { 054 055 056 public CasePreservingProteinSequenceCreator( 057 CompoundSet<AminoAcidCompound> compoundSet) { 058 super(compoundSet); 059 } 060 061 /** 062 * 063 * @see org.biojava.nbio.core.sequence.io.ProteinSequenceCreator#getSequence(org.biojava.nbio.core.sequence.template.ProxySequenceReader, long) 064 */ 065 @Override 066 public AbstractSequence<AminoAcidCompound> getSequence( 067 ProxySequenceReader<AminoAcidCompound> proxyLoader, long index) { 068 AbstractSequence<AminoAcidCompound> seq = super.getSequence(proxyLoader, index); 069 seq.setUserCollection(getStringCase(proxyLoader.getSequenceAsString())); 070 return seq; 071 } 072 073 /* (non-Javadoc) 074 * @see org.biojava.nbio.core.sequence.io.ProteinSequenceCreator#getSequence(java.lang.String, long) 075 */ 076 @Override 077 public AbstractSequence<AminoAcidCompound> getSequence(String sequence, 078 long index) throws CompoundNotFoundException { 079 AbstractSequence<AminoAcidCompound> seq = super.getSequence(sequence.toUpperCase(Locale.ENGLISH), index); 080 seq.setUserCollection(getStringCase(sequence)); 081 return seq; 082 } 083 084 085 /** 086 * Assumes all compounds were uppercase 087 * @see org.biojava.nbio.core.sequence.io.ProteinSequenceCreator#getSequence(java.util.List) 088 */ 089 @Override 090 public AbstractSequence<AminoAcidCompound> getSequence( 091 List<AminoAcidCompound> list) { 092 AbstractSequence<AminoAcidCompound> seq =super.getSequence(list); 093 Collection<Object> strCase = new ArrayList<Object>(seq.getLength()); 094 for(int i=0;i<seq.getLength();i++) { 095 strCase.add(true); 096 } 097 seq.setUserCollection(strCase); 098 return seq; 099 } 100 101 /** 102 * Returns a list of Booleans of the same length as the input, specifying 103 * whether each character was uppercase or not. 104 * @param str A string. Should not contain unicode supplemental characters. 105 * @return a list of Booleans of the same length as the input, specifying 106 * whether each character was uppercase or not. 107 * This list contains only Booleans. 108 */ 109 private static List<Object> getStringCase(String str) { 110 List<Object> types = new ArrayList<Object>(str.length()); 111 for(int i=0;i<str.length();i++) { 112 types.add(Character.isUpperCase(str.charAt(i))); 113 } 114 return types; 115 } 116 117 118 /** 119 * Takes a {@link ProteinSequence} which was created by a 120 * {@link CasePreservingProteinSequenceCreator}. Uses the case info 121 * stored in the user collection to modify the output array. 122 * 123 * <p>Sets elements of the output array which correspond to lowercase letters 124 * to null. 125 * 126 * @param seq Input sequence with case stored as the user collection 127 * @param out 128 */ 129 public static void setLowercaseToNull( ProteinSequence seq, 130 Object[] out) { 131 // should have been set by seq creator 132 Collection<Object> userCollection = seq.getUserCollection(); 133 if(userCollection == null) 134 throw new IllegalArgumentException("Sequence doesn't contain valid case info"); 135 if(userCollection.size() != out.length) 136 throw new IllegalArgumentException("Sequence length doesn't math output array length"); 137 138 int pos = 0; 139 for(Object isAligned : userCollection) { 140 assert(isAligned instanceof Boolean); 141 if(!(Boolean)isAligned) { 142 out[pos] = null; 143 } 144 pos++; 145 } 146 } 147}