001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.sequence.io;
022
023import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
024import org.biojava.nbio.core.sequence.ProteinSequence;
025import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
026import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
027import org.biojava.nbio.core.sequence.template.AbstractSequence;
028import org.biojava.nbio.core.sequence.template.CompoundSet;
029import org.biojava.nbio.core.sequence.template.ProxySequenceReader;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
033import java.util.ArrayList;
034import java.util.Collection;
035import java.util.List;
036import java.util.Locale;
037
038/**
039 * A sequence creator which preserves the case of its input string in
040 * the user collection of the returned ProteinSequence.
041 *
042 * <p>The user collection will be the same length as the resulting ProteinSequence.
043 * Each object can be cast to a Boolean. If true, the corresponding position in
044 * the input file was uppercase.
045 *
046 * <h3>Example</h3>
047 * <code><pre>CasePreservingProteinSequenceCreator creator =
048 *    new CasePreservingProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet());
049 *AbstractSequence<AminoAcidCompound> seq = creator.getSequence("aaAA",0);
050 *System.out.println(seq.getSequenceAsString()); //"AAAA"
051 *System.out.println(seq.getUserCollection()); //"[false, false, true, true]"
052 *</code></pre>
053 */
054public class CasePreservingProteinSequenceCreator extends ProteinSequenceCreator {
055
056        private final static Logger logger = LoggerFactory.getLogger(CasePreservingProteinSequenceCreator.class);
057
058        public CasePreservingProteinSequenceCreator(
059                        CompoundSet<AminoAcidCompound> compoundSet) {
060                super(compoundSet);
061        }
062
063        /**
064         *
065         * @see org.biojava.nbio.core.sequence.io.ProteinSequenceCreator#getSequence(org.biojava.nbio.core.sequence.template.ProxySequenceReader, long)
066         */
067        @Override
068        public AbstractSequence<AminoAcidCompound> getSequence(
069                        ProxySequenceReader<AminoAcidCompound> proxyLoader, long index) {
070                AbstractSequence<AminoAcidCompound> seq = super.getSequence(proxyLoader, index);
071                seq.setUserCollection(getStringCase(proxyLoader.getSequenceAsString()));
072                return seq;
073        }
074
075        /* (non-Javadoc)
076         * @see org.biojava.nbio.core.sequence.io.ProteinSequenceCreator#getSequence(java.lang.String, long)
077         */
078        @Override
079        public AbstractSequence<AminoAcidCompound> getSequence(String sequence,
080                        long index) throws CompoundNotFoundException {
081                AbstractSequence<AminoAcidCompound> seq = super.getSequence(sequence.toUpperCase(Locale.ENGLISH), index);
082                seq.setUserCollection(getStringCase(sequence));
083                return seq;
084        }
085
086
087        /**
088         * Assumes all compounds were uppercase
089         * @see org.biojava.nbio.core.sequence.io.ProteinSequenceCreator#getSequence(java.util.List)
090         */
091        @Override
092        public AbstractSequence<AminoAcidCompound> getSequence(
093                        List<AminoAcidCompound> list) {
094                AbstractSequence<AminoAcidCompound> seq =super.getSequence(list);
095                Collection<Object> strCase = new ArrayList<Object>(seq.getLength());
096                for(int i=0;i<seq.getLength();i++) {
097                        strCase.add(true);
098                }
099                seq.setUserCollection(strCase);
100                return seq;
101        }
102
103        /**
104         * Returns a list of Booleans of the same length as the input, specifying
105         * whether each character was uppercase or not.
106         * @param str A string. Should not contain unicode supplemental characters.
107         * @return a list of Booleans of the same length as the input, specifying
108         * whether each character was uppercase or not.
109         * This list contains only Booleans.
110         */
111        private static List<Object> getStringCase(String str) {
112                List<Object> types = new ArrayList<Object>(str.length());
113                for(int i=0;i<str.length();i++) {
114                        types.add(Character.isUpperCase(str.charAt(i)));
115                }
116                return types;
117        }
118
119        public static void main(String[] args) throws Exception {
120                CasePreservingProteinSequenceCreator creator = new CasePreservingProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet());
121                AbstractSequence<AminoAcidCompound> seq = creator.getSequence("aaAA",0);
122                logger.info("Sequence: {}", seq.getSequenceAsString()); //"AAAA"
123                logger.info("User Collection: {}", seq.getUserCollection()); //"[false, false, true, true]"
124        }
125
126        /**
127         * Takes a {@link ProteinSequence} which was created by a
128         * {@link CasePreservingProteinSequenceCreator}. Uses the case info
129         * stored in the user collection to modify the output array.
130         *
131         * <p>Sets elements of the output array which correspond to lowercase letters
132         * to null.
133         *
134         * @param seq Input sequence with case stored as the user collection
135         * @param out
136         */
137        public static void setLowercaseToNull( ProteinSequence seq,
138                        Object[] out) {
139                // should have been set by seq creator
140                Collection<Object> userCollection = seq.getUserCollection();
141                if(userCollection == null)
142                        throw new IllegalArgumentException("Sequence doesn't contain valid case info");
143                if(userCollection.size() != out.length)
144                        throw new IllegalArgumentException("Sequence length doesn't math output array length");
145
146                int pos = 0;
147                for(Object isAligned : userCollection) {
148                        assert(isAligned instanceof Boolean);
149                        if(!(Boolean)isAligned) {
150                                out[pos] = null;
151                        }
152                        pos++;
153                }
154        }
155}