001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.io;
023
024import org.biojava.utils.lsid.LifeScienceIdentifier;
025
026/**
027 * <code>SeqIOConstants</code> contains constants used to identify
028 * sequence formats, alphabets etc, in the context of reading and
029 * writing sequences.
030 *
031 * <p>An <code>int</code> used to specify symbol alphabet and
032 * sequence format type is derived thus:</p>
033 *
034 * <ul>
035 *   <li>
036 *    The two least significant bytes are reserved for format types
037 *    such as RAW, FASTA, EMBL etc.
038 *   </li>
039 *
040 *   <li>
041 *    The two most significant bytes are reserved for alphabet and
042 *    symbol information such as AMBIGUOUS, DNA, RNA, AA etc.
043 *   </li>
044 *
045 *   <li>
046 *    Bitwise OR combinations of each component <code>int</code> are used
047 *    to specify combinations of format type and symbol information. To
048 *    derive an <code>int</code> identifier for DNA with ambiguity codes
049 *    in Fasta format, bitwise OR the AMBIGUOUS, DNA and FASTA values.
050 *   </li>
051 * </ul>
052 *
053 * @author Keith James
054 */
055public final class SeqIOConstants
056{
057    /**
058     * <code>AMBIGUOUS</code> indicates that a sequence contains
059     * ambiguity symbols. The first bit of the most significant word
060     * of the int is set.
061     */
062    public static final int AMBIGUOUS = 1 << 16;
063
064    /**
065     * <code>DNA</code> indicates that a sequence contains DNA
066     * (deoxyribonucleic acid) symbols. The second bit of the most
067     * significant word of the int is set.
068     */
069    public static final int DNA = 1 << 17;
070
071    /**
072     * <code>RNA</code> indicates that a sequence contains RNA
073     * (ribonucleic acid) symbols. The third bit of the most
074     * significant word of the int is set.
075     */
076    public static final int RNA = 1 << 18;
077
078    /**
079     * <code>AA</code> indicates that a sequence contains AA (amino
080     * acid) symbols. The fourth bit of the most significant word of
081     * the int is set.
082     */
083    public static final int AA = 1 << 19;
084
085    /**
086     * <code>INTEGER</code> indicates that a sequence contains integer
087     * alphabet symbols, such as used to describe sequence quality
088     * data. The fifth bit of the most significant word of the int is
089     * set.
090     */
091    public static final int INTEGER = 1 << 20;
092
093    /**
094     * <code>UNKNOWN</code> indicates that the sequence format is
095     * unknown.
096     */
097    public static final int UNKNOWN = 0;
098
099    /**
100     * <code>RAW</code> indicates that the sequence format is raw
101     * (symbols only).
102     */
103    public static final int RAW = 1;
104
105    /**
106     * <code>FASTA</code> indicates that the sequence format is Fasta.
107     */
108    public static final int FASTA = 2;
109
110    /**
111     * <code>NBRF</code> indicates that the sequence format is NBRF.
112     */
113    public static final int NBRF = 3;
114
115    /**
116     * <code>IG</code> indicates that the sequence format is IG.
117     */
118    public static final int IG = 4;
119
120    /**
121     * <code>EMBL</code> indicates that the sequence format is EMBL.
122     */
123    public static final int EMBL = 10;
124
125    /**
126     * <code>SWISSPROT</code> indicates that the sequence format is
127     * SWISSPROT. Always protein, so already had the AA bit set.
128     */
129    public static final int SWISSPROT = 11 | AA;
130
131    /**
132     * <code>GENBANK</code> indicates that the sequence format is
133     * GENBANK.
134     */
135    public static final int GENBANK = 12;
136
137    /**
138     * <code>GENPEPT</code> indicates that the sequence format is
139     * GENPEPT. Always protein, so already had the AA bit set.
140     */
141    public static final int GENPEPT = 13 | AA;
142
143    /**
144     * <code>REFSEQ</code> indicates that the sequence format is
145     * REFSEQ.
146     */
147    public static final int REFSEQ = 14;
148
149    /**
150     * <code>GCG</code> indicates that the sequence format is GCG.
151     */
152    public static final int GCG = 15;
153
154    /**
155     * <code>GFF</code> indicates that the sequence format is GFF.
156     */
157    public static final int GFF = 20;
158
159    /**
160     * <code>PDB</code> indicates that the sequence format is
161     * PDB. Always protein, so already had the AA bit set.
162     */
163    public static final int PDB = 21 | AA;
164
165    /**
166     * <code>PHRED</code> indicates that the sequence format is
167     * PHRED. Always DNA, so already had the DNA bit set. Also has
168     * INTEGER bit set for quality data.
169     */
170    public static final int PHRED = 30 | DNA | INTEGER;
171
172    /**
173     * <code>EMBL_DNA</code> premade EMBL | DNA.
174     */
175    public static final int EMBL_DNA = EMBL | DNA;
176
177    /**
178     * <code>EMBL_RNA</code> premade EMBL | RNA.
179     */
180    public static final int EMBL_RNA = EMBL | RNA;
181
182    /**
183     * <code>EMBL_AA</code> premade EMBL | AA.
184     */
185    public static final int EMBL_AA = EMBL | AA;
186
187    /**
188     * <code>GENBANK_DNA</code> premade GENBANK | DNA.
189     */
190    public static final int GENBANK_DNA = GENBANK | DNA;
191
192    /**
193     * <code>GENBANK_DNA</code> premade GENBANK | RNA.
194     */
195    public static final int GENBANK_RNA = GENBANK | RNA;
196
197    /**
198     * <code>GENBANK_DNA</code> premade GENBANK | AA.
199     */
200    public static final int  GENBANK_AA = GENBANK | AA;
201
202    /**
203     * <code>REFSEQ_DNA</code> premade REFSEQ | DNA.
204     */
205    public static final int REFSEQ_DNA = REFSEQ | DNA;
206
207    /**
208     * <code>REFSEQ_RNA</code> premade REFSEQ | RNA.
209     */
210    public static final int REFSEQ_RNA = REFSEQ | RNA;
211
212    /**
213     * <code>REFSEQ_AA</code> premade REFSEQ | AA.
214     */
215    public static final int REFSEQ_AA = REFSEQ | AA;
216
217    /**
218     * <code>FASTA_DNA</code> premade FASTA | DNA.
219     */
220    public static final int FASTA_DNA = FASTA | DNA;
221
222    /**
223     * <code>FASTA_RNA</code> premade FASTA | RNA.
224     */
225    public static final int FASTA_RNA = FASTA | RNA;
226
227    /**
228     * <code>FASTA_AA</code> premade FASTA | AA.
229     */
230    public static final int FASTA_AA = FASTA | AA;
231
232    /**
233     * <code>LSID_FASTA_DNA</code> sequence format LSID for Fasta DNA.
234     */
235    public static final LifeScienceIdentifier LSID_FASTA_DNA =
236        LifeScienceIdentifier.valueOf("open-bio.org", "fasta", "dna");
237
238    /**
239     * <code>LSID_FASTA_RNA</code> sequence format LSID for Fasta RNA.
240     */
241    public static final LifeScienceIdentifier LSID_FASTA_RNA =
242        LifeScienceIdentifier.valueOf("open-bio.org", "fasta", "rna");
243
244    /**
245     * <code>LSID_FASTA_AA</code> sequence format LSID for Fasta AA.
246     */
247    public static final LifeScienceIdentifier LSID_FASTA_AA =
248        LifeScienceIdentifier.valueOf("open-bio.org", "fasta", "protein");
249
250    /**
251     * <code>LSID_EMBL_DNA</code> sequence format LSID for EMBL DNA.
252     */
253    public static final LifeScienceIdentifier LSID_EMBL_DNA =
254        LifeScienceIdentifier.valueOf("open-bio.org", "embl", "dna");
255
256    /**
257     * <code>LSID_EMBL_RNA</code> sequence format LSID for EMBL RNA.
258     */
259    public static final LifeScienceIdentifier LSID_EMBL_RNA =
260        LifeScienceIdentifier.valueOf("open-bio.org", "embl", "rna");
261
262    /**
263     * <code>LSID_EMBL_AA</code> sequence format LSID for EMBL AA.
264     */
265    public static final LifeScienceIdentifier LSID_EMBL_AA =
266        LifeScienceIdentifier.valueOf("open-bio.org", "embl", "protein");
267
268    /**
269     * <code>LSID_GENBANK_DNA</code> sequence format LSID for Genbank
270     * DNA.
271     */
272    public static final LifeScienceIdentifier LSID_GENBANK_DNA =
273        LifeScienceIdentifier.valueOf("open-bio.org", "genbank", "dna");
274
275    /**
276     * <code>LSID_GENBANK_RNA</code> sequence format LSID for Genbank
277     * RNA.
278     */
279    public static final LifeScienceIdentifier LSID_GENBANK_RNA =
280        LifeScienceIdentifier.valueOf("open-bio.org", "genbank", "rna");
281
282    /**
283     * <code>LSID_GENBANK_AA</code> sequence format LSID for Genbank
284     * AA.
285     */
286    public static final LifeScienceIdentifier LSID_GENBANK_AA =
287        LifeScienceIdentifier.valueOf("open-bio.org", "genbank", "protein");
288
289    /**
290     * <code>LSID_SWISSPROT</code> sequence format LSID for Swissprot.
291     */
292    public static final LifeScienceIdentifier LSID_SWISSPROT =
293        LifeScienceIdentifier.valueOf("open-bio.org", "swiss", "protein");
294}