Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.aaproperties;
022
023import org.slf4j.Logger;
024import org.slf4j.LoggerFactory;
025
026import java.util.HashSet;
027import java.util.Set;
028
029/**
030 * This is a utility class that contains utility methods which will facilitates the coding of other methods
031 *
032 * @author kohchuanhock
033 * @version 2011.08.22
034 * @since 3.0.2
035 */
036public class Utils {
037
038        private final static Logger logger = LoggerFactory.getLogger(Utils.class);
039
040        /**
041         * Returns a value with the desired number of decimal places.
042         *
043         * @param d
044         *              value to round
045         * @param c
046         *              number of decimal places desired.
047         *              Must be greater or equal to zero, otherwise, the given value d would be returned without any modification.
048         * @return
049         *              a value with the given number of decimal places.
050         */
051        public final static double roundToDecimals(double d, int c) {
052                if(c < 0) return d;
053                double p = Math.pow(10,c);
054                d = d * p;
055                double tmp = Math.round(d);
056                return tmp/p;
057        }
058
059        /**
060         * Checks if given sequence contains invalid characters. Returns true if invalid characters are found, else return false.
061         * Note that any characters are deemed as valid only if it is found in cSet.
062         *
063         * @param sequence
064         *              protein sequence to be check.
065         * @param cSet
066         *              the set of characters that are deemed valid.
067         * @return
068         *              true if invalid characters are found, else return false.
069         */
070        public final static boolean doesSequenceContainInvalidChar(String sequence, Set<Character> cSet){
071                for(char c:sequence.toCharArray()){
072                        if(!cSet.contains(c)) return true;
073                }
074                return false;
075        }
076
077        /**
078         * Return the number of invalid characters in sequence.
079         *
080         * @param sequence
081         *              protein sequence to count for invalid characters.
082         * @param cSet
083         *              the set of characters that are deemed valid.
084         * @param ignoreCase
085         *              indicates if cases should be ignored
086         * @return
087         *              the number of invalid characters in sequence.
088         */
089        public final static int getNumberOfInvalidChar(String sequence, Set<Character> cSet, boolean ignoreCase){
090                int total = 0;
091                char[] cArray;
092                if(ignoreCase) cArray = sequence.toUpperCase().toCharArray();
093                else cArray = sequence.toCharArray();
094                if(cSet == null) cSet = PeptideProperties.standardAASet;
095                for(char c:cArray){
096                        if(!cSet.contains(c)) total++;
097                }
098                return total;
099        }
100
101        /**
102         * Returns a new sequence with all invalid characters being replaced by '-'.
103         * Note that any character outside of the 20 standard protein amino acid codes are considered as invalid.
104         *
105         * @param sequence
106         *              protein sequence to be clean
107         * @param cSet
108         *              user defined characters that are valid. Can be null. If null, then 20 standard protein amino acid codes will be considered as valid.
109         * @return
110         *              a new sequence with all invalid characters being replaced by '-'.
111         */
112        public final static String cleanSequence(String sequence, Set<Character> cSet){
113                Set<Character> invalidCharSet = new HashSet<Character>();
114                StringBuilder cleanSeq = new StringBuilder();
115                if(cSet == null) cSet = PeptideProperties.standardAASet;
116                for(char c:sequence.toCharArray()){
117                        if(!cSet.contains(c)){
118                                cleanSeq.append("-");
119                                invalidCharSet.add(c);
120                        }else{
121                                cleanSeq.append(c);
122                        }
123                }
124
125                // TODO: Should be StringJoiner once JDK8 used
126                StringBuilder stringBuilder = new StringBuilder();
127                for(char c: invalidCharSet){
128                        stringBuilder.append("\'" + c + "\'");
129                }
130                stringBuilder.deleteCharAt(stringBuilder.length()-1);
131                stringBuilder.append(" are being replaced with '-'");
132                logger.warn(stringBuilder.toString());
133
134                return cleanSeq.toString();
135        }
136
137        /**
138         * Checks if the sequence contains invalid characters.
139         * Note that any character outside of the 20 standard protein amino acid codes are considered as invalid.
140         * If yes, it will return a new sequence where invalid characters are replaced with '-'.
141         * If no, it will simply return the input sequence.
142         *
143         * @param sequence
144         *              protein sequence to be check for invalid characters.
145         * @return
146         *              a sequence with no invalid characters.
147         */
148        public static final String checkSequence(String sequence){
149                return checkSequence(sequence, null);
150        }
151
152        /**
153         * Checks if the sequence contains invalid characters.
154         * Note that any character outside of the 20 standard protein amino acid codes are considered as invalid.
155         * If yes, it will return a new sequence where invalid characters are replaced with '-'.
156         * If no, it will simply return the input sequence.
157         *
158         * @param sequence
159         *              protein sequence to be check for invalid characters.
160         * @param cSet
161         *              character set which define the valid characters.
162         * @return
163         *              a sequence with no invalid characters.
164         */
165        public static final String checkSequence(String sequence, Set<Character> cSet){
166                boolean containInvalid = false;
167                if(cSet != null){
168                        containInvalid = sequence != null && doesSequenceContainInvalidChar(sequence, cSet);
169                }else{
170                        containInvalid = sequence != null && doesSequenceContainInvalidChar(sequence, PeptideProperties.standardAASet);
171                }
172                if(containInvalid){
173                        String cSeq = cleanSequence(sequence, cSet);
174                        logger.warn("There exists invalid characters in the sequence. Computed results might not be precise.");
175                        logger.warn("To remove this warning: Please use org.biojava.nbio.aaproperties.Utils.cleanSequence to clean sequence.");
176
177                        return cSeq;
178                }
179                else{
180                        return sequence;
181                }
182        }
183}