Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.sequence.io;
022
023import org.biojava.nbio.core.sequence.AccessionID;
024import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
025import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
026import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
027import org.biojava.nbio.core.sequence.features.FeatureInterface;
028import org.biojava.nbio.core.sequence.io.template.GenbankHeaderFormatInterface;
029import org.biojava.nbio.core.sequence.template.AbstractSequence;
030import org.biojava.nbio.core.sequence.template.Compound;
031import org.biojava.nbio.core.util.StringManipulationHelper;
032
033import java.text.SimpleDateFormat;
034import java.util.*;
035
036public class GenericGenbankHeaderFormat<S extends AbstractSequence<C>, C extends Compound>
037                extends GenericInsdcHeaderFormat<S, C> implements
038                GenbankHeaderFormatInterface<S, C> {
039        private static final int HEADER_WIDTH = 12;
040        private static final String lineSep = System.lineSeparator();
041        private String seqType = null;
042        private boolean useOriginalHeader = false;
043
044        public GenericGenbankHeaderFormat() {
045                seqType = null;
046        }
047
048        public GenericGenbankHeaderFormat(String seqType) {
049                this.seqType = seqType;
050        }
051        
052        public GenericGenbankHeaderFormat(boolean useOriginalHeader) {
053                this.useOriginalHeader = useOriginalHeader;
054        }
055
056        /**
057         * Used in the the 'header' of each GenBank record.
058         *
059         * @param tag
060         * @param text
061         */
062        private String _write_single_line(String tag, String text) {
063                assert tag.length() < HEADER_WIDTH;
064                return StringManipulationHelper.padRight(tag, HEADER_WIDTH)
065                                + text.replace('\n', ' ') + lineSep;
066        }
067
068        /**
069         * Used in the the 'header' of each GenBank record.
070         *
071         * @param tag
072         * @param text
073         */
074        private String _write_multi_line(String tag, String text) {
075                if (text == null) {
076                        text = "";
077                }
078                int max_len = MAX_WIDTH - HEADER_WIDTH;
079                List<String> lines = _split_multi_line(text, max_len);
080                String output = _write_single_line(tag, lines.get(0));
081                for (int i = 1; i < lines.size(); i++) {
082                        output += _write_single_line("", lines.get(i));
083                }
084                return output;
085        }
086
087        private String _get_date(S sequence) {
088                Date sysdate = Calendar.getInstance().getTime();
089
090                // String default_date =
091                // sysdate.get(Calendar.DAY_OF_MONTH)+"-"+sysdate.get(Calendar.MONTH)+"-"+sysdate.get(Calendar.YEAR);
092                String default_date = new SimpleDateFormat("dd-MMM-yyyy")
093                                .format(sysdate);
094                return default_date;
095                /*
096                 * try : date = record.annotations["date"] except KeyError : return
097                 * default #Cope with a list of one string: if isinstance(date, list)
098                 * and len(date)==1 : date = date[0] #TODO - allow a Python date object
099                 * if not isinstance(date, str) or len(date) != 11 \ or date[2] != "-"
100                 * or date[6] != "-" \ or not date[:2].isdigit() or not
101                 * date[7:].isdigit() \ or int(date[:2]) > 31 \ or date[3:6] not in
102                 * ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP",
103                 * "OCT", "NOV", "DEC"] : #TODO - Check is a valid date (e.g. not 31
104                 * Feb) return default return date
105                 */
106        }
107
108        private String _get_data_division(S sequence) {
109                return UNKNOWN_DNA;
110                /*
111                 * try: division = record.annotations["data_file_division"] except
112                 * KeyError: division = "UNK" if division in ["PRI", "ROD", "MAM",
113                 * "VRT", "INV", "PLN", "BCT", "VRL", "PHG", "SYN", "UNA", "EST", "PAT",
114                 * "STS", "GSS", "HTG", "HTC", "ENV", "CON"]: #Good, already GenBank
115                 * style # PRI - primate sequences # ROD - rodent sequences # MAM -
116                 * other mammalian sequences # VRT - other vertebrate sequences # INV -
117                 * invertebrate sequences # PLN - plant, fungal, and algal sequences #
118                 * BCT - bacterial sequences [plus archea] # VRL - viral sequences # PHG
119                 * - bacteriophage sequences # SYN - synthetic sequences # UNA -
120                 * unannotated sequences # EST - EST sequences (expressed sequence tags)
121                 * # PAT - patent sequences # STS - STS sequences (sequence tagged
122                 * sites) # GSS - GSS sequences (genome survey sequences) # HTG - HTGS
123                 * sequences (high throughput genomic sequences) # HTC - HTC sequences
124                 * (high throughput cDNA sequences) # ENV - Environmental sampling
125                 * sequences # CON - Constructed sequences # #(plus UNK for unknown)
126                 * pass else: #See if this is in EMBL style: # Division Code #
127                 * ----------------- ---- # Bacteriophage PHG - common # Environmental
128                 * Sample ENV - common # Fungal FUN - map to PLN (plants + fungal) #
129                 * Human HUM - map to PRI (primates) # Invertebrate INV - common # Other
130                 * Mammal MAM - common # Other Vertebrate VRT - common # Mus musculus
131                 * MUS - map to ROD (rodent) # Plant PLN - common # Prokaryote PRO - map
132                 * to BCT (poor name) # Other Rodent ROD - common # Synthetic SYN -
133                 * common # Transgenic TGN - ??? map to SYN ??? # Unclassified UNC - map
134                 * to UNK # Viral VRL - common # #(plus XXX for submiting which we can
135                 * map to UNK) embl_to_gbk = {"FUN":"PLN", "HUM":"PRI", "MUS":"ROD",
136                 * "PRO":"BCT", "UNC":"UNK", "XXX":"UNK", } try: division =
137                 * embl_to_gbk[division] except KeyError: division = "UNK" assert
138                 * len(division)==3 return division
139                 */
140        }
141
142        /**
143         * Write the LOCUS line.
144         *
145         * @param sequence
146         */
147        private String _write_the_first_line(S sequence) {
148                /*
149                 * locus = record.name if not locus or locus == "<unknown name>": locus
150                 * = record.id if not locus or locus == "<unknown id>": locus =
151                 * self._get_annotation_str(record, "accession", just_first=True)\
152                 */
153                String locus;
154                try {
155                        locus = sequence.getAccession().getID();
156                } catch (Exception e) {
157                        locus = "";
158                }
159                if (locus.length() > 16) {
160                        throw new RuntimeException("Locus identifier " + locus
161                                        + " is too long");
162                }
163
164                String units = "";
165                String mol_type = "";
166                if (sequence.getCompoundSet() instanceof DNACompoundSet) {
167                        units = "bp";
168                        mol_type = "DNA";
169                } else if (sequence.getCompoundSet() instanceof RNACompoundSet) {
170                        units = "bp";
171                        mol_type = "RNA";
172                } else if (sequence.getCompoundSet() instanceof AminoAcidCompoundSet) {
173                        units = "aa";
174                        mol_type = "";
175                } else {
176                        throw new RuntimeException(
177                                        "Need a DNACompoundSet, RNACompoundSet, or an AminoAcidCompoundSet");
178                }
179
180                String division = _get_data_division(sequence);
181
182                if (seqType != null) {
183                        division = seqType;
184                }
185                assert units.length() == 2;
186
187                // the next line does not seem right.. seqType == linear
188                // uncommenting for now
189                //assert division.length() == 3;
190
191                StringBuilder sb = new StringBuilder();
192                Formatter formatter = new Formatter(sb, Locale.US);
193                formatter
194                                .format("LOCUS       %s %s %s    %s           %s %s" + lineSep,
195                                                StringManipulationHelper.padRight(locus, 16),
196                                                StringManipulationHelper.padLeft(
197                                                                Integer.toString(sequence.getLength()), 11),
198                                                units, StringManipulationHelper.padRight(mol_type, 6), division,
199                                                _get_date(sequence));
200                String output = formatter.toString();
201                formatter.close();
202                return output;
203                /*
204                 * assert len(line) == 79+1, repr(line) #plus one for new line
205                 *
206                 * assert line[12:28].rstrip() == locus, \ 'LOCUS line does not contain
207                 * the locus at the expected position:\n' + line assert line[28:29] ==
208                 * " " assert line[29:40].lstrip() == str(len(record)), \ 'LOCUS line
209                 * does not contain the length at the expected position:\n' + line
210                 *
211                 * #Tests copied from Bio.GenBank.Scanner assert line[40:44] in [' bp ',
212                 * ' aa '] , \ 'LOCUS line does not contain size units at expected
213                 * position:\n' + line assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'],
214                 * \ 'LOCUS line does not have valid strand type (Single stranded,
215                 * ...):\n' + line assert line[47:54].strip() == "" \ or
216                 * line[47:54].strip().find('DNA') != -1 \ or
217                 * line[47:54].strip().find('RNA') != -1, \ 'LOCUS line does not contain
218                 * valid sequence type (DNA, RNA, ...):\n' + line assert line[54:55] ==
219                 * ' ', \ 'LOCUS line does not contain space at position 55:\n' + line
220                 * assert line[55:63].strip() in ['', 'linear', 'circular'], \ 'LOCUS
221                 * line does not contain valid entry (linear, circular, ...):\n' + line
222                 * assert line[63:64] == ' ', \ 'LOCUS line does not contain space at
223                 * position 64:\n' + line assert line[67:68] == ' ', \ 'LOCUS line does
224                 * not contain space at position 68:\n' + line assert line[70:71] ==
225                 * '-', \ 'LOCUS line does not contain - at position 71 in date:\n' +
226                 * line assert line[74:75] == '-', \ 'LOCUS line does not contain - at
227                 * position 75 in date:\n' + line
228                 */
229        }
230
231        /**
232         * Write the original LOCUS line.
233         *
234         * @param sequence
235         */
236        private String _write_original_first_line(S sequence) {
237
238                StringBuilder sb = new StringBuilder();
239                Formatter formatter = new Formatter(sb, Locale.US);
240                formatter.format("LOCUS       %s" + lineSep,
241                                StringManipulationHelper.padRight(sequence.getOriginalHeader(), 16));
242                String output = formatter.toString();
243                formatter.close();
244                return output;
245        }
246        
247        /**
248         * This is a bit complicated due to the range of possible ways people might
249         * have done their annotation... Currently the parser uses a single string
250         * with newlines. A list of lines is also reasonable. A single (long) string
251         * is perhaps the most natural of all. This means we may need to deal with
252         * line wrapping.
253         *
254         * @param sequence
255         */
256        private String _write_comment(S sequence) {
257                List<String> comments = sequence.getNotesList();
258                String output = _write_multi_line("COMMENT", comments.remove(0));
259                for (String comment : comments) {
260                        output += _write_multi_line("", comment);
261                }
262
263                return output;
264        }
265
266        @Override
267        public String getHeader(S sequence) {
268                
269                String header;
270                
271                if (useOriginalHeader) {
272                        header = _write_original_first_line(sequence);
273                } else {
274                        header = _write_the_first_line(sequence);
275                }
276                
277                AccessionID accessionIdObj = sequence.getAccession();
278                String accession;
279                String acc_with_version;
280                
281                try {
282                        accession = accessionIdObj.getID();
283
284                        if (accessionIdObj.getIdentifier() != null) {
285                                acc_with_version = sequence.getAccession().getID() + "." + sequence.getAccession().getVersion() + " GI:"
286                                                + accessionIdObj.getIdentifier();
287
288                        } else {
289                                acc_with_version = sequence.getAccession().getID() + "." + sequence.getAccession().getVersion();
290                        }
291
292                } catch (Exception e) {
293                        acc_with_version = "";
294                        accession = "";
295                }
296                String description = sequence.getDescription();
297                if ("<unknown description>".equals(description) || description == null) {
298                        description = ".";
299                }
300                header += _write_multi_line("DEFINITION", description);
301                header += _write_multi_line("ACCESSION", accession);
302                header += _write_multi_line("VERSION", acc_with_version);
303
304                /*
305                 * #The NCBI only expect two types of link so far, #e.g. "Project:28471"
306                 * and "Trace Assembly Archive:123456" #TODO - Filter the dbxrefs list
307                 * to just these? self._write_multi_entries("DBLINK", record.dbxrefs)
308                 *
309                 * try: #List of strings #Keywords should be given separated with semi
310                 * colons, keywords = "; ".join(record.annotations["keywords"]) #with a
311                 * trailing period: if not keywords.endswith(".") : keywords += "."
312                 * except KeyError: #If no keywords, there should be just a period:
313                 * keywords = "."
314                 */
315
316                header += _write_multi_line("KEYWORDS", ".");
317
318                /*
319                 * if "segment" in record.annotations: #Deal with SEGMENT line found
320                 * only in segmented records, #e.g. AH000819 segment =
321                 * record.annotations["segment"] if isinstance(segment, list): assert
322                 * len(segment)==1, segment segment = segment[0]
323                 * self._write_single_line("SEGMENT", segment)
324                 *
325                 * self._write_multi_line("SOURCE", \ self._get_annotation_str(record,
326                 * "source"))
327                 */
328
329                header += _write_multi_line("SOURCE", sequence.getSource());
330
331                /*
332                 * #The ORGANISM line MUST be a single line, as any continuation is the
333                 * taxonomy org = self._get_annotation_str(record, "organism") if
334                 * len(org) > self.MAX_WIDTH - self.HEADER_WIDTH: org =
335                 * org[:self.MAX_WIDTH - self.HEADER_WIDTH-4]+"..."
336                 * self._write_single_line("  ORGANISM", org) try: #List of strings
337                 * #Taxonomy should be given separated with semi colons, taxonomy =
338                 * "; ".join(record.annotations["taxonomy"]) #with a trailing period: if
339                 * not taxonomy.endswith(".") : taxonomy += "." except KeyError:
340                 * taxonomy = "." self._write_multi_line("", taxonomy)
341                 *
342                 * if "references" in record.annotations: self._write_references(record)
343                 */
344                if (!sequence.getNotesList().isEmpty()) {
345                        header += _write_comment(sequence);
346                }
347
348                header += "FEATURES             Location/Qualifiers" + lineSep;
349                int rec_length = sequence.getLength();
350                for (FeatureInterface<AbstractSequence<C>, C> feature : sequence
351                                .getFeatures()) {
352                        header += _write_feature(feature, rec_length);
353                }
354
355                return String.format(header);
356        }
357
358}