/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.biojava.nbio.core.sequence.template.ProxySequenceReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;

/**
 * A sequence creator which preserves the case of its input string in
 * the user collection of the returned ProteinSequence.
 *
 * <p>The user collection will be the same length as the resulting ProteinSequence.
 * Each object can be cast to a Boolean. If true, the corresponding position in
 * the input file was uppercase.
 *
 * <p>Example:
 * <pre>{@code
 * CasePreservingProteinSequenceCreator creator =
 *     new CasePreservingProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet());
 * AbstractSequence<AminoAcidCompound> seq = creator.getSequence("aaAA",0);
 * System.out.println(seq.getSequenceAsString()); //"AAAA"
 * System.out.println(seq.getUserCollection()); //"[false, false, true, true]"
 * }</pre>
 */
public class CasePreservingProteinSequenceCreator extends ProteinSequenceCreator {

	private static final Logger logger = LoggerFactory.getLogger(CasePreservingProteinSequenceCreator.class);

	/**
	 * Creates a case-preserving creator backed by the given compound set.
	 *
	 * @param compoundSet compound set handed to the superclass constructor
	 */
	public CasePreservingProteinSequenceCreator(
			CompoundSet<AminoAcidCompound> compoundSet) {
		super(compoundSet);
	}

	/**
	 * Builds the sequence through the superclass and stores, as its user
	 * collection, the per-character case of the string reported by
	 * {@code proxyLoader.getSequenceAsString()}.
	 *
	 * @see org.biojava.nbio.core.sequence.io.ProteinSequenceCreator#getSequence(org.biojava.nbio.core.sequence.template.ProxySequenceReader, long)
	 */
	@Override
	public AbstractSequence<AminoAcidCompound> getSequence(
			ProxySequenceReader<AminoAcidCompound> proxyLoader, long index) {
		AbstractSequence<AminoAcidCompound> seq = super.getSequence(proxyLoader, index);
		seq.setUserCollection(getStringCase(proxyLoader.getSequenceAsString()));
		return seq;
	}

	/**
	 * Builds the sequence from the upper-cased input (using
	 * {@link Locale#ENGLISH}) and stores the per-character case of the
	 * <em>original</em> string as the user collection.
	 *
	 * @see org.biojava.nbio.core.sequence.io.ProteinSequenceCreator#getSequence(java.lang.String, long)
	 */
	@Override
	public AbstractSequence<AminoAcidCompound> getSequence(String sequence,
			long index) throws CompoundNotFoundException {
		// The superclass only ever sees the upper-cased text; the case
		// information is taken from the untouched input.
		AbstractSequence<AminoAcidCompound> seq =
				super.getSequence(sequence.toUpperCase(Locale.ENGLISH), index);
		seq.setUserCollection(getStringCase(sequence));
		return seq;
	}

	/**
	 * Assumes all compounds were uppercase: the user collection is filled
	 * with one {@code true} per position of the resulting sequence.
	 *
	 * @see org.biojava.nbio.core.sequence.io.ProteinSequenceCreator#getSequence(java.util.List)
	 */
	@Override
	public AbstractSequence<AminoAcidCompound> getSequence(
			List<AminoAcidCompound> list) {
		AbstractSequence<AminoAcidCompound> seq = super.getSequence(list);
		int length = seq.getLength();
		Collection<Object> strCase = new ArrayList<Object>(length);
		for (int i = 0; i < length; i++) {
			strCase.add(Boolean.TRUE);
		}
		seq.setUserCollection(strCase);
		return seq;
	}

	/**
	 * Returns a list of Booleans of the same length as the input, specifying
	 * whether each character was uppercase or not.
	 *
	 * @param str A string. Should not contain unicode supplemental characters.
	 * @return a list of Booleans of the same length as the input, specifying
	 *  whether each character was uppercase or not.
	 *  This list contains only Booleans.
	 */
	private static List<Object> getStringCase(String str) {
		List<Object> types = new ArrayList<Object>(str.length());
		for (int i = 0; i < str.length(); i++) {
			types.add(Character.isUpperCase(str.charAt(i)));
		}
		return types;
	}

	/**
	 * Small demonstration: builds a sequence from {@code "aaAA"} and logs the
	 * sequence string and its user collection.
	 *
	 * @param args ignored
	 * @throws Exception if the sequence cannot be created
	 */
	public static void main(String[] args) throws Exception {
		CasePreservingProteinSequenceCreator creator =
				new CasePreservingProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet());
		AbstractSequence<AminoAcidCompound> seq = creator.getSequence("aaAA",0);
		logger.info("Sequence: {}", seq.getSequenceAsString()); //"AAAA"
		logger.info("User Collection: {}", seq.getUserCollection()); //"[false, false, true, true]"
	}

	/**
	 * Takes a {@link ProteinSequence} which was created by a
	 * {@link CasePreservingProteinSequenceCreator}. Uses the case info
	 * stored in the user collection to modify the output array.
	 *
	 * <p>Sets elements of the output array which correspond to lowercase letters
	 * to null.
	 *
	 * @param seq Input sequence with case stored as the user collection
	 * @param out Array modified in place; must have exactly as many elements
	 *  as the user collection of {@code seq}
	 * @throws IllegalArgumentException if the sequence has no user collection,
	 *  if its size differs from {@code out.length}, or if it contains an
	 *  element that is not a Boolean
	 */
	public static void setLowercaseToNull( ProteinSequence seq,
			Object[] out) {
		// should have been set by seq creator
		Collection<Object> userCollection = seq.getUserCollection();
		if (userCollection == null) {
			throw new IllegalArgumentException("Sequence doesn't contain valid case info");
		}
		if (userCollection.size() != out.length) {
			throw new IllegalArgumentException("Sequence length doesn't match output array length");
		}

		int pos = 0;
		for (Object wasUppercase : userCollection) {
			// Checked explicitly rather than via assert so that bad case info
			// is reported the same way whether or not assertions are enabled.
			if (!(wasUppercase instanceof Boolean)) {
				throw new IllegalArgumentException("Sequence doesn't contain valid case info");
			}
			if (!(Boolean) wasUppercase) {
				out[pos] = null;
			}
			pos++;
		}
	}
}