001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq.io; 023 024import org.biojava.utils.lsid.LifeScienceIdentifier; 025 026/** 027 * <code>SeqIOConstants</code> contains constants used to identify 028 * sequence formats, alphabets etc, in the context of reading and 029 * writing sequences. 030 * 031 * <p>An <code>int</code> used to specify symbol alphabet and 032 * sequence format type is derived thus:</p> 033 * 034 * <ul> 035 * <li> 036 * The two least significant bytes are reserved for format types 037 * such as RAW, FASTA, EMBL etc. 038 * </li> 039 * 040 * <li> 041 * The two most significant bytes are reserved for alphabet and 042 * symbol information such as AMBIGUOUS, DNA, RNA, AA etc. 043 * </li> 044 * 045 * <li> 046 * Bitwise OR combinations of each component <code>int</code> are used 047 * to specify combinations of format type and symbol information. To 048 * derive an <code>int</code> identifier for DNA with ambiguity codes 049 * in Fasta format, bitwise OR the AMBIGUOUS, DNA and FASTA values. 050 * </li> 051 * </ul> 052 * 053 * @author Keith James 054 */ 055public final class SeqIOConstants 056{ 057 /** 058 * <code>AMBIGUOUS</code> indicates that a sequence contains 059 * ambiguity symbols. The first bit of the most significant word 060 * of the int is set. 061 */ 062 public static final int AMBIGUOUS = 1 << 16; 063 064 /** 065 * <code>DNA</code> indicates that a sequence contains DNA 066 * (deoxyribonucleic acid) symbols. The second bit of the most 067 * significant word of the int is set. 068 */ 069 public static final int DNA = 1 << 17; 070 071 /** 072 * <code>RNA</code> indicates that a sequence contains RNA 073 * (ribonucleic acid) symbols. The third bit of the most 074 * significant word of the int is set. 075 */ 076 public static final int RNA = 1 << 18; 077 078 /** 079 * <code>AA</code> indicates that a sequence contains AA (amino 080 * acid) symbols. The fourth bit of the most significant word of 081 * the int is set. 082 */ 083 public static final int AA = 1 << 19; 084 085 /** 086 * <code>INTEGER</code> indicates that a sequence contains integer 087 * alphabet symbols, such as used to describe sequence quality 088 * data. The fifth bit of the most significant word of the int is 089 * set. 090 */ 091 public static final int INTEGER = 1 << 20; 092 093 /** 094 * <code>UNKNOWN</code> indicates that the sequence format is 095 * unknown. 096 */ 097 public static final int UNKNOWN = 0; 098 099 /** 100 * <code>RAW</code> indicates that the sequence format is raw 101 * (symbols only). 102 */ 103 public static final int RAW = 1; 104 105 /** 106 * <code>FASTA</code> indicates that the sequence format is Fasta. 107 */ 108 public static final int FASTA = 2; 109 110 /** 111 * <code>NBRF</code> indicates that the sequence format is NBRF. 112 */ 113 public static final int NBRF = 3; 114 115 /** 116 * <code>IG</code> indicates that the sequence format is IG. 117 */ 118 public static final int IG = 4; 119 120 /** 121 * <code>EMBL</code> indicates that the sequence format is EMBL. 122 */ 123 public static final int EMBL = 10; 124 125 /** 126 * <code>SWISSPROT</code> indicates that the sequence format is 127 * SWISSPROT. Always protein, so already had the AA bit set. 128 */ 129 public static final int SWISSPROT = 11 | AA; 130 131 /** 132 * <code>GENBANK</code> indicates that the sequence format is 133 * GENBANK. 134 */ 135 public static final int GENBANK = 12; 136 137 /** 138 * <code>GENPEPT</code> indicates that the sequence format is 139 * GENPEPT. Always protein, so already had the AA bit set. 140 */ 141 public static final int GENPEPT = 13 | AA; 142 143 /** 144 * <code>REFSEQ</code> indicates that the sequence format is 145 * REFSEQ. 146 */ 147 public static final int REFSEQ = 14; 148 149 /** 150 * <code>GCG</code> indicates that the sequence format is GCG. 151 */ 152 public static final int GCG = 15; 153 154 /** 155 * <code>GFF</code> indicates that the sequence format is GFF. 156 */ 157 public static final int GFF = 20; 158 159 /** 160 * <code>PDB</code> indicates that the sequence format is 161 * PDB. Always protein, so already had the AA bit set. 162 */ 163 public static final int PDB = 21 | AA; 164 165 /** 166 * <code>PHRED</code> indicates that the sequence format is 167 * PHRED. Always DNA, so already had the DNA bit set. Also has 168 * INTEGER bit set for quality data. 169 */ 170 public static final int PHRED = 30 | DNA | INTEGER; 171 172 /** 173 * <code>EMBL_DNA</code> premade EMBL | DNA. 174 */ 175 public static final int EMBL_DNA = EMBL | DNA; 176 177 /** 178 * <code>EMBL_RNA</code> premade EMBL | RNA. 179 */ 180 public static final int EMBL_RNA = EMBL | RNA; 181 182 /** 183 * <code>EMBL_AA</code> premade EMBL | AA. 184 */ 185 public static final int EMBL_AA = EMBL | AA; 186 187 /** 188 * <code>GENBANK_DNA</code> premade GENBANK | DNA. 189 */ 190 public static final int GENBANK_DNA = GENBANK | DNA; 191 192 /** 193 * <code>GENBANK_DNA</code> premade GENBANK | RNA. 194 */ 195 public static final int GENBANK_RNA = GENBANK | RNA; 196 197 /** 198 * <code>GENBANK_DNA</code> premade GENBANK | AA. 199 */ 200 public static final int GENBANK_AA = GENBANK | AA; 201 202 /** 203 * <code>REFSEQ_DNA</code> premade REFSEQ | DNA. 204 */ 205 public static final int REFSEQ_DNA = REFSEQ | DNA; 206 207 /** 208 * <code>REFSEQ_RNA</code> premade REFSEQ | RNA. 209 */ 210 public static final int REFSEQ_RNA = REFSEQ | RNA; 211 212 /** 213 * <code>REFSEQ_AA</code> premade REFSEQ | AA. 214 */ 215 public static final int REFSEQ_AA = REFSEQ | AA; 216 217 /** 218 * <code>FASTA_DNA</code> premade FASTA | DNA. 219 */ 220 public static final int FASTA_DNA = FASTA | DNA; 221 222 /** 223 * <code>FASTA_RNA</code> premade FASTA | RNA. 224 */ 225 public static final int FASTA_RNA = FASTA | RNA; 226 227 /** 228 * <code>FASTA_AA</code> premade FASTA | AA. 229 */ 230 public static final int FASTA_AA = FASTA | AA; 231 232 /** 233 * <code>LSID_FASTA_DNA</code> sequence format LSID for Fasta DNA. 234 */ 235 public static final LifeScienceIdentifier LSID_FASTA_DNA = 236 LifeScienceIdentifier.valueOf("open-bio.org", "fasta", "dna"); 237 238 /** 239 * <code>LSID_FASTA_RNA</code> sequence format LSID for Fasta RNA. 240 */ 241 public static final LifeScienceIdentifier LSID_FASTA_RNA = 242 LifeScienceIdentifier.valueOf("open-bio.org", "fasta", "rna"); 243 244 /** 245 * <code>LSID_FASTA_AA</code> sequence format LSID for Fasta AA. 246 */ 247 public static final LifeScienceIdentifier LSID_FASTA_AA = 248 LifeScienceIdentifier.valueOf("open-bio.org", "fasta", "protein"); 249 250 /** 251 * <code>LSID_EMBL_DNA</code> sequence format LSID for EMBL DNA. 252 */ 253 public static final LifeScienceIdentifier LSID_EMBL_DNA = 254 LifeScienceIdentifier.valueOf("open-bio.org", "embl", "dna"); 255 256 /** 257 * <code>LSID_EMBL_RNA</code> sequence format LSID for EMBL RNA. 258 */ 259 public static final LifeScienceIdentifier LSID_EMBL_RNA = 260 LifeScienceIdentifier.valueOf("open-bio.org", "embl", "rna"); 261 262 /** 263 * <code>LSID_EMBL_AA</code> sequence format LSID for EMBL AA. 264 */ 265 public static final LifeScienceIdentifier LSID_EMBL_AA = 266 LifeScienceIdentifier.valueOf("open-bio.org", "embl", "protein"); 267 268 /** 269 * <code>LSID_GENBANK_DNA</code> sequence format LSID for Genbank 270 * DNA. 271 */ 272 public static final LifeScienceIdentifier LSID_GENBANK_DNA = 273 LifeScienceIdentifier.valueOf("open-bio.org", "genbank", "dna"); 274 275 /** 276 * <code>LSID_GENBANK_RNA</code> sequence format LSID for Genbank 277 * RNA. 278 */ 279 public static final LifeScienceIdentifier LSID_GENBANK_RNA = 280 LifeScienceIdentifier.valueOf("open-bio.org", "genbank", "rna"); 281 282 /** 283 * <code>LSID_GENBANK_AA</code> sequence format LSID for Genbank 284 * AA. 285 */ 286 public static final LifeScienceIdentifier LSID_GENBANK_AA = 287 LifeScienceIdentifier.valueOf("open-bio.org", "genbank", "protein"); 288 289 /** 290 * <code>LSID_SWISSPROT</code> sequence format LSID for Swissprot. 291 */ 292 public static final LifeScienceIdentifier LSID_SWISSPROT = 293 LifeScienceIdentifier.valueOf("open-bio.org", "swiss", "protein"); 294}