001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.aaproperties;
022
023import org.slf4j.Logger;
024import org.slf4j.LoggerFactory;
025
026import java.nio.CharBuffer;
027import java.util.HashSet;
028import java.util.Set;
029
030/**
031 * This is a utility class that contains utility methods which will facilitates the coding of other methods
032 *
033 * @author kohchuanhock
034 * @version 2011.08.22
035 * @since 3.0.2
036 */
037public class Utils {
038
039        private final static Logger logger = LoggerFactory.getLogger(Utils.class);
040
041        /**
042         * Returns a value with the desired number of decimal places.
043         *
044         * @param d
045         *              value to round
046         * @param c
047         *              number of decimal places desired.
048         *              Must be greater or equal to zero, otherwise, the given value d would be returned without any modification.
049         * @return
050         *              a value with the given number of decimal places.
051         */
052        public final static double roundToDecimals(double d, int c) {
053                if(c < 0) return d;
054                double p = Math.pow(10,c);
055                d = d * p;
056                double tmp = Math.round(d);
057                return tmp/p;
058        }
059
060        /**
061         * Checks if given sequence contains invalid characters. Returns true if invalid characters are found, else return false.
062         * Note that any characters are deemed as valid only if it is found in cSet.
063         *
064         * @param sequence
065         *              protein sequence to be check.
066         * @param cSet
067         *              the set of characters that are deemed valid.
068         * @return
069         *              true if invalid characters are found, else return false.
070         */
071        public final static boolean doesSequenceContainInvalidChar(String sequence, Set<Character> cSet){
072                                for(char c:sequence.toCharArray()){
073                                        if(!cSet.contains(c)) return true;
074                                }
075                                return false;
076        }
077
078        /**
079         * Return the number of invalid characters in sequence.
080         *
081         * @param sequence
082         *              protein sequence to count for invalid characters.
083         * @param cSet
084         *              the set of characters that are deemed valid.
085         * @param ignoreCase
086         *              indicates if cases should be ignored
087         * @return
088         *              the number of invalid characters in sequence.
089         */
090        public final static int getNumberOfInvalidChar(String sequence,  Set<Character> cSet, boolean ignoreCase){
091                char[] cArray = ignoreCase ? sequence.toUpperCase().toCharArray(): sequence.toCharArray();
092                final Set<Character> characterSet = cSet == null ?PeptideProperties.standardAASet: cSet ; 
093                int total = (int)CharBuffer.wrap(cArray).chars().filter(character -> !characterSet.contains((char)character)).count();
094                return total;
095        }
096
097        /**
098         * Returns a new sequence with all invalid characters being replaced by '-'.
099         * Note that any character outside of the 20 standard protein amino acid codes are considered as invalid.
100         *
101         * @param sequence
102         *              protein sequence to be clean
103         * @param cSet
104         *              user defined characters that are valid. Can be null. If null, then 20 standard protein amino acid codes will be considered as valid.
105         * @return
106         *              a new sequence with all invalid characters being replaced by '-'.
107         */
108        public final static String cleanSequence(String sequence, Set<Character> cSet){
109                Set<Character> invalidCharSet = new HashSet<>();
110                StringBuilder cleanSeq = new StringBuilder();
111                if(cSet == null) cSet = PeptideProperties.standardAASet;
112                for(char c:sequence.toCharArray()){
113                        if(!cSet.contains(c)){
114                                cleanSeq.append("-");
115                                invalidCharSet.add(c);
116                        }else{
117                                cleanSeq.append(c);
118                        }
119                }
120
121                // TODO: Should be StringJoiner once JDK8 used
122                StringBuilder stringBuilder = new StringBuilder();
123                for(char c: invalidCharSet){
124                        stringBuilder.append("\'" + c + "\'");
125                }
126                stringBuilder.deleteCharAt(stringBuilder.length()-1);
127                stringBuilder.append(" are being replaced with '-'");
128                logger.warn(stringBuilder.toString());
129
130                return cleanSeq.toString();
131        }
132
133        /**
134         * Checks if the sequence contains invalid characters.
135         * Note that any character outside of the 20 standard protein amino acid codes are considered as invalid.
136         * If yes, it will return a new sequence where invalid characters are replaced with '-'.
137         * If no, it will simply return the input sequence.
138         *
139         * @param sequence
140         *              protein sequence to be check for invalid characters.
141         * @return
142         *              a sequence with no invalid characters.
143         */
144        public static final String checkSequence(String sequence){
145                return checkSequence(sequence, null);
146        }
147
148        /**
149         * Checks if the sequence contains invalid characters.
150         * Note that any character outside of the 20 standard protein amino acid codes are considered as invalid.
151         * If yes, it will return a new sequence where invalid characters are replaced with '-'.
152         * If no, it will simply return the input sequence.
153         *
154         * @param sequence
155         *              protein sequence to be check for invalid characters.
156         * @param cSet
157         *              character set which define the valid characters.
158         * @return
159         *              a sequence with no invalid characters.
160         */
161        public static final String checkSequence(String sequence, Set<Character> cSet){
162                boolean containInvalid = false;
163                if(cSet != null){
164                        containInvalid = sequence != null && doesSequenceContainInvalidChar(sequence, cSet);
165                }else{
166                        containInvalid = sequence != null && doesSequenceContainInvalidChar(sequence, PeptideProperties.standardAASet);
167                }
168                if(containInvalid){
169                        String cSeq = cleanSequence(sequence, cSet);
170                        logger.warn("There exists invalid characters in the sequence. Computed results might not be precise.");
171                        logger.warn("To remove this warning: Please use org.biojava.nbio.aaproperties.Utils.cleanSequence to clean sequence.");
172
173                        return cSeq;
174                }
175                else{
176                        return sequence;
177                }
178        }
179}