Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.sequence.io;
022
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
import org.biojava.nbio.core.sequence.features.FeatureInterface;
import org.biojava.nbio.core.sequence.io.template.GenbankHeaderFormatInterface;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.util.StringManipulationHelper;

import java.text.SimpleDateFormat;
import java.util.*;
033
034public class GenericGenbankHeaderFormat<S extends AbstractSequence<C>, C extends Compound>
035                extends GenericInsdcHeaderFormat<S, C> implements
036                GenbankHeaderFormatInterface<S, C> {
037        private static final int HEADER_WIDTH = 12;
038        private static final String lineSep = "%n";
039        private String seqType = null;
040
041        public GenericGenbankHeaderFormat() {
042                seqType = null;
043        }
044
045        public GenericGenbankHeaderFormat(String seqType) {
046                this.seqType = seqType;
047        }
048
049        /**
050         * Used in the the 'header' of each GenBank record.
051         *
052         * @param tag
053         * @param text
054         */
055        private String _write_single_line(String tag, String text) {
056                assert tag.length() < HEADER_WIDTH;
057                return StringManipulationHelper.padRight(tag, HEADER_WIDTH)
058                                + text.replace('\n', ' ') + lineSep;
059        }
060
061        /**
062         * Used in the the 'header' of each GenBank record.
063         *
064         * @param tag
065         * @param text
066         */
067        private String _write_multi_line(String tag, String text) {
068                if (text == null) {
069                        text = "";
070                }
071                int max_len = MAX_WIDTH - HEADER_WIDTH;
072                ArrayList<String> lines = _split_multi_line(text, max_len);
073                String output = _write_single_line(tag, lines.get(0));
074                for (int i = 1; i < lines.size(); i++) {
075                        output += _write_single_line("", lines.get(i));
076                }
077                return output;
078        }
079
        /*
         * Disabled helper, kept for reference: intended for DBLINK and any
         * similar later line types. If the list of strings is empty, nothing
         * should be written (the draft below does not yet guard against an
         * empty list).
         *
         * private String _write_multi_entries(String tag, ArrayList<String>
         * text_list) { String output = _write_single_line(tag,text_list.remove(0));
         * for(String s : text_list) { output += _write_single_line("", s); } return
         * output; }
         */
093
094        private String _get_date(S sequence) {
095                Date sysdate = Calendar.getInstance().getTime();
096
097                // String default_date =
098                // sysdate.get(Calendar.DAY_OF_MONTH)+"-"+sysdate.get(Calendar.MONTH)+"-"+sysdate.get(Calendar.YEAR);
099                String default_date = new SimpleDateFormat("dd-MMM-yyyy")
100                                .format(sysdate);
101                return default_date;
102                /*
103                 * try : date = record.annotations["date"] except KeyError : return
104                 * default #Cope with a list of one string: if isinstance(date, list)
105                 * and len(date)==1 : date = date[0] #TODO - allow a Python date object
106                 * if not isinstance(date, str) or len(date) != 11 \ or date[2] != "-"
107                 * or date[6] != "-" \ or not date[:2].isdigit() or not
108                 * date[7:].isdigit() \ or int(date[:2]) > 31 \ or date[3:6] not in
109                 * ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP",
110                 * "OCT", "NOV", "DEC"] : #TODO - Check is a valid date (e.g. not 31
111                 * Feb) return default return date
112                 */
113        }
114
115        private String _get_data_division(S sequence) {
116                return UNKNOWN_DNA;
117                /*
118                 * try: division = record.annotations["data_file_division"] except
119                 * KeyError: division = "UNK" if division in ["PRI", "ROD", "MAM",
120                 * "VRT", "INV", "PLN", "BCT", "VRL", "PHG", "SYN", "UNA", "EST", "PAT",
121                 * "STS", "GSS", "HTG", "HTC", "ENV", "CON"]: #Good, already GenBank
122                 * style # PRI - primate sequences # ROD - rodent sequences # MAM -
123                 * other mammalian sequences # VRT - other vertebrate sequences # INV -
124                 * invertebrate sequences # PLN - plant, fungal, and algal sequences #
125                 * BCT - bacterial sequences [plus archea] # VRL - viral sequences # PHG
126                 * - bacteriophage sequences # SYN - synthetic sequences # UNA -
127                 * unannotated sequences # EST - EST sequences (expressed sequence tags)
128                 * # PAT - patent sequences # STS - STS sequences (sequence tagged
129                 * sites) # GSS - GSS sequences (genome survey sequences) # HTG - HTGS
130                 * sequences (high throughput genomic sequences) # HTC - HTC sequences
131                 * (high throughput cDNA sequences) # ENV - Environmental sampling
132                 * sequences # CON - Constructed sequences # #(plus UNK for unknown)
133                 * pass else: #See if this is in EMBL style: # Division Code #
134                 * ----------------- ---- # Bacteriophage PHG - common # Environmental
135                 * Sample ENV - common # Fungal FUN - map to PLN (plants + fungal) #
136                 * Human HUM - map to PRI (primates) # Invertebrate INV - common # Other
137                 * Mammal MAM - common # Other Vertebrate VRT - common # Mus musculus
138                 * MUS - map to ROD (rodent) # Plant PLN - common # Prokaryote PRO - map
139                 * to BCT (poor name) # Other Rodent ROD - common # Synthetic SYN -
140                 * common # Transgenic TGN - ??? map to SYN ??? # Unclassified UNC - map
141                 * to UNK # Viral VRL - common # #(plus XXX for submiting which we can
142                 * map to UNK) embl_to_gbk = {"FUN":"PLN", "HUM":"PRI", "MUS":"ROD",
143                 * "PRO":"BCT", "UNC":"UNK", "XXX":"UNK", } try: division =
144                 * embl_to_gbk[division] except KeyError: division = "UNK" assert
145                 * len(division)==3 return division
146                 */
147        }
148
149        /**
150         * Write the LOCUS line.
151         *
152         * @param sequence
153         * @param seqType
154         */
155        private String _write_the_first_line(S sequence) {
156                /*
157                 * locus = record.name if not locus or locus == "<unknown name>": locus
158                 * = record.id if not locus or locus == "<unknown id>": locus =
159                 * self._get_annotation_str(record, "accession", just_first=True)\
160                 */
161                String locus;
162                try {
163                        locus = sequence.getAccession().getID();
164                } catch (Exception e) {
165                        locus = "";
166                }
167                if (locus.length() > 16) {
168                        throw new RuntimeException("Locus identifier " + locus
169                                        + " is too long");
170                }
171
172                String units = "";
173                String mol_type = "";
174                if (sequence.getCompoundSet() instanceof DNACompoundSet) {
175                        units = "bp";
176                        mol_type = "DNA";
177                } else if (sequence.getCompoundSet() instanceof DNACompoundSet) {
178                        units = "bp";
179                        mol_type = "RNA";
180                } else if (sequence.getCompoundSet() instanceof AminoAcidCompoundSet) {
181                        units = "aa";
182                        mol_type = "";
183                } else {
184                        throw new RuntimeException(
185                                        "Need a DNACompoundSet, RNACompoundSet, or an AminoAcidCompoundSet");
186                }
187
188                String division = _get_data_division(sequence);
189
190                if (seqType != null) {
191                        division = seqType;
192                }
193                assert units.length() == 2;
194
195                // the next line does not seem right.. seqType == linear
196                // uncommenting for now
197                //assert division.length() == 3;
198
199                StringBuilder sb = new StringBuilder();
200                Formatter formatter = new Formatter(sb, Locale.US);
201                formatter
202                                .format("LOCUS       %s %s %s    %s           %s %s" + lineSep,
203                                                StringManipulationHelper.padRight(locus, 16),
204                                                StringManipulationHelper.padLeft(
205                                                                Integer.toString(sequence.getLength()), 11),
206                                                units, StringManipulationHelper.padRight(mol_type, 6), division,
207                                                _get_date(sequence));
208                String output = formatter.toString();
209                formatter.close();
210                return output;
211                /*
212                 * assert len(line) == 79+1, repr(line) #plus one for new line
213                 *
214                 * assert line[12:28].rstrip() == locus, \ 'LOCUS line does not contain
215                 * the locus at the expected position:\n' + line assert line[28:29] ==
216                 * " " assert line[29:40].lstrip() == str(len(record)), \ 'LOCUS line
217                 * does not contain the length at the expected position:\n' + line
218                 *
219                 * #Tests copied from Bio.GenBank.Scanner assert line[40:44] in [' bp ',
220                 * ' aa '] , \ 'LOCUS line does not contain size units at expected
221                 * position:\n' + line assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'],
222                 * \ 'LOCUS line does not have valid strand type (Single stranded,
223                 * ...):\n' + line assert line[47:54].strip() == "" \ or
224                 * line[47:54].strip().find('DNA') != -1 \ or
225                 * line[47:54].strip().find('RNA') != -1, \ 'LOCUS line does not contain
226                 * valid sequence type (DNA, RNA, ...):\n' + line assert line[54:55] ==
227                 * ' ', \ 'LOCUS line does not contain space at position 55:\n' + line
228                 * assert line[55:63].strip() in ['', 'linear', 'circular'], \ 'LOCUS
229                 * line does not contain valid entry (linear, circular, ...):\n' + line
230                 * assert line[63:64] == ' ', \ 'LOCUS line does not contain space at
231                 * position 64:\n' + line assert line[67:68] == ' ', \ 'LOCUS line does
232                 * not contain space at position 68:\n' + line assert line[70:71] ==
233                 * '-', \ 'LOCUS line does not contain - at position 71 in date:\n' +
234                 * line assert line[74:75] == '-', \ 'LOCUS line does not contain - at
235                 * position 75 in date:\n' + line
236                 */
237        }
238
239        /**
240         * This is a bit complicated due to the range of possible ways people might
241         * have done their annotation... Currently the parser uses a single string
242         * with newlines. A list of lines is also reasonable. A single (long) string
243         * is perhaps the most natural of all. This means we may need to deal with
244         * line wrapping.
245         *
246         * @param sequence
247         */
248        private String _write_comment(S sequence) {
249                ArrayList<String> comments = sequence.getNotesList();
250                String output = _write_multi_line("COMMENT", comments.remove(0));
251                for (String comment : comments) {
252                        output += _write_multi_line("", comment);
253                }
254
255                return output;
256        }
257
258        @Override
259        public String getHeader(S sequence) {
260                String header = _write_the_first_line(sequence);
261                String acc_with_version;
262                String accession;
263                try {
264                        acc_with_version = sequence.getAccession().getID();
265                        accession = acc_with_version.split("\\.", 1)[0];
266                } catch (Exception e) {
267                        acc_with_version = "";
268                        accession = "";
269                }
270                String description = sequence.getDescription();
271                if ("<unknown description>".equals(description) || description == null) {
272                        description = ".";
273                }
274                header += _write_multi_line("DEFINITION", description);
275                header += _write_multi_line("ACCESSION", accession);
276                header += _write_multi_line("VERSION", acc_with_version);
277
278                /*
279                 * gi = self._get_annotation_str(record, "gi", just_first=True)
280                 *
281                 * self._write_single_line("ACCESSION", accession) if gi != ".":
282                 * self._write_single_line("VERSION", "%s  GI:%s" \ % (acc_with_version,
283                 * gi)) else: self._write_single_line("VERSION", "%s" %
284                 * (acc_with_version))
285                 *
286                 * #The NCBI only expect two types of link so far, #e.g. "Project:28471"
287                 * and "Trace Assembly Archive:123456" #TODO - Filter the dbxrefs list
288                 * to just these? self._write_multi_entries("DBLINK", record.dbxrefs)
289                 *
290                 * try: #List of strings #Keywords should be given separated with semi
291                 * colons, keywords = "; ".join(record.annotations["keywords"]) #with a
292                 * trailing period: if not keywords.endswith(".") : keywords += "."
293                 * except KeyError: #If no keywords, there should be just a period:
294                 * keywords = "."
295                 */
296
297                header += _write_multi_line("KEYWORDS", ".");
298
299                /*
300                 * if "segment" in record.annotations: #Deal with SEGMENT line found
301                 * only in segmented records, #e.g. AH000819 segment =
302                 * record.annotations["segment"] if isinstance(segment, list): assert
303                 * len(segment)==1, segment segment = segment[0]
304                 * self._write_single_line("SEGMENT", segment)
305                 *
306                 * self._write_multi_line("SOURCE", \ self._get_annotation_str(record,
307                 * "source"))
308                 */
309
310                header += _write_multi_line("SOURCE", sequence.getSource());
311
312                /*
313                 * #The ORGANISM line MUST be a single line, as any continuation is the
314                 * taxonomy org = self._get_annotation_str(record, "organism") if
315                 * len(org) > self.MAX_WIDTH - self.HEADER_WIDTH: org =
316                 * org[:self.MAX_WIDTH - self.HEADER_WIDTH-4]+"..."
317                 * self._write_single_line("  ORGANISM", org) try: #List of strings
318                 * #Taxonomy should be given separated with semi colons, taxonomy =
319                 * "; ".join(record.annotations["taxonomy"]) #with a trailing period: if
320                 * not taxonomy.endswith(".") : taxonomy += "." except KeyError:
321                 * taxonomy = "." self._write_multi_line("", taxonomy)
322                 *
323                 * if "references" in record.annotations: self._write_references(record)
324                 */
325                if (!sequence.getNotesList().isEmpty()) {
326                        header += _write_comment(sequence);
327                }
328
329                header += "FEATURES             Location/Qualifiers" + lineSep;
330                int rec_length = sequence.getLength();
331                for (FeatureInterface<AbstractSequence<C>, C> feature : sequence
332                                .getFeatures()) {
333                        header += _write_feature(feature, rec_length);
334                }
335
336                return header;
337        }
338
339}