001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021/**
022 *
023 */
024package org.biojava.nbio.core.sequence.io;
025
026import org.biojava.nbio.core.sequence.Strand;
027import org.biojava.nbio.core.sequence.features.FeatureInterface;
028import org.biojava.nbio.core.sequence.features.Qualifier;
029import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
030import org.biojava.nbio.core.sequence.location.template.Point;
031import org.biojava.nbio.core.sequence.template.AbstractSequence;
032import org.biojava.nbio.core.sequence.template.Compound;
033import org.biojava.nbio.core.util.StringManipulationHelper;
034
035import java.util.ArrayList;
036import java.util.Collections;
037import java.util.Formatter;
038import java.util.List;
039import java.util.Locale;
040
041/**
042 * @author mckeee1
043 *
044 */
045public class GenericInsdcHeaderFormat<S extends AbstractSequence<C>, C extends Compound> {
046        protected static final int MAX_WIDTH = 80;
047        protected static final int QUALIFIER_INDENT = 21;
048        protected static final String QUALIFIER_INDENT_STR = "                     ";
049        protected static final String QUALIFIER_INDENT_TMP = "     %s                ";
050        private static final String lineSep = "%n";
051
052        /**
053         * Format a feature qualifier using the MAX_WIDTH (default 80)
054         * @param key
055         * @param value
056         * @param quote
057         */
058        private String _write_feature_qualifier(String key, String value, boolean quote) {
059                String line = "";
060                if(null == value) {
061                        line = QUALIFIER_INDENT_STR + "/" + key + lineSep;
062                        return line;
063                }
064                if(quote) {  // quote should be true for numerics
065                        line = QUALIFIER_INDENT_STR + "/" + key + "=\"" + value + "\"";
066                } else {
067                        line = QUALIFIER_INDENT_STR + "/" + key + "=" + value;
068                }
069                if(line.length() <= MAX_WIDTH) {
070                        return line + lineSep;
071                }
072                String goodlines = "";
073                while(!"".equals(line.replaceAll("^\\s+", ""))) {
074                        if(line.length() <= MAX_WIDTH) {
075                                goodlines += line + lineSep;
076                                break;
077                        }
078                        //Insert line break...
079                        int index;
080                        for(index = Math.min(line.length()-1, MAX_WIDTH); index > QUALIFIER_INDENT ; index--) {
081                                if(' ' == line.charAt(index)) {
082                                        break;
083                                }
084                        }
085                        if(' ' != line.charAt(index)) {
086                                //no nice place to break...
087                                index = MAX_WIDTH;
088                        }
089                        assert index <= MAX_WIDTH;
090                        goodlines += line.substring(0,index) + lineSep;
091                        line = QUALIFIER_INDENT_STR + line.substring(index).replaceAll("^\\s+", "");
092                }
093                return goodlines;
094        }
095        /**
096         * Split a feature location into lines (break at commas).
097         * @param location
098         */
099        private String _wrap_location(String location) {
100                int length = MAX_WIDTH - QUALIFIER_INDENT;
101                if(location.length() <= length) {
102                        return location;
103                }
104                int index = location.substring(length).lastIndexOf(",");
105                if(-1 == index) {
106                        //No good place to split (!)
107                        return location;
108                }
109                return location.substring(0,index+1) + lineSep + QUALIFIER_INDENT_STR + _wrap_location(location.substring(index+1));
110        }
111        /**
112         * Write a single SeqFeature object to features table.
113         * @param feature
114         * @param record_length
115         */
116        protected String _write_feature(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) {
117                String location = _insdc_feature_location_string(feature, record_length);
118                String f_type = feature.getType().replace(" ", "_");
119                StringBuilder sb = new StringBuilder();
120                Formatter formatter = new Formatter(sb,Locale.US);
121                formatter.format(QUALIFIER_INDENT_TMP, f_type);
122                String line = formatter.toString().substring(0, QUALIFIER_INDENT) + _wrap_location(location) + lineSep;
123                formatter.close();
124
125                //Now the qualifiers...
126                for(List<Qualifier>  qualifiers : feature.getQualifiers().values()) {
127                        for(Qualifier q : qualifiers){
128                                line += _write_feature_qualifier(q.getName(), q.getValue(), q.needsQuotes());
129                        }
130                }
131                return line;
132                /*
133                self.handle.write(line)
134                #Now the qualifiers...
135                for key, values in feature.qualifiers.items():
136                        if isinstance(values, list) or isinstance(values, tuple):
137                                for value in values:
138                                        self._write_feature_qualifier(key, value)
139                        elif values:
140                                #String, int, etc
141                                self._write_feature_qualifier(key, values)
142                        else:
143                                #e.g. a /psuedo entry
144                                self._write_feature_qualifier(key)
145                 */
146        }
147        /**
148         * Build a GenBank/EMBL location string from a SeqFeature (PRIVATE).
149
150        There is a choice of how to show joins on the reverse complement strand,
151        GenBank used "complement(join(1,10),(20,100))" while EMBL used to use
152        "join(complement(20,100),complement(1,10))" instead (but appears to have
153        now adopted the GenBank convention). Notice that the order of the entries
154        is reversed! This function therefore uses the first form. In this situation
155        we expect the parent feature and the two children to all be marked as
156        strand == -1, and in the order 0:10 then 19:100.
157
158        Also need to consider dual-strand examples like these from the Arabidopsis
159        thaliana chloroplast NC_000932: join(complement(69611..69724),139856..140650)
160        gene ArthCp047, GeneID:844801 or its CDS (protein NP_051038.1 GI:7525057)
161        which is further complicated by a splice:
162        join(complement(69611..69724),139856..140087,140625..140650)
163
164        For mixed this mixed strand feature, the parent SeqFeature should have
165        no strand (either 0 or None) while the child features should have either
166        strand +1 or -1 as appropriate, and be listed in the order given here.
167         * @param feature
168         * @param record_length
169         */
170        private String _insdc_feature_location_string(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) {
171                if(feature.getChildrenFeatures().isEmpty()) {
172                        //Non-recursive.
173                        String location = _insdc_location_string_ignoring_strand_and_subfeatures(feature.getLocations(), record_length);
174                        if(feature.getLocations().getStrand() == Strand.NEGATIVE) {
175                                StringBuilder sb = new StringBuilder();
176                                Formatter formatter = new Formatter(sb,Locale.US);
177                                formatter.format("complement(%s)", location);
178                                String output = formatter.toString();
179                                formatter.close();
180                                location = output;
181                        }
182                        return location;
183                }
184                // As noted above, treat reverse complement strand features carefully:
185                if(feature.getLocations().getStrand() == Strand.NEGATIVE) {
186                        for(FeatureInterface<?, ?> f  : feature.getChildrenFeatures()) {
187                                if(f.getLocations().getStrand() != Strand.NEGATIVE) {
188                                        StringBuilder sb = new StringBuilder();
189                                        Formatter formatter = new Formatter(sb,Locale.US);
190                                        formatter.format("Inconsistent strands: %s for parent, %s for child", feature.getLocations().getStrand(), f.getLocations().getStrand());
191                                        String output = formatter.toString();
192                                        formatter.close();
193                                        throw new RuntimeException(output);
194                                }
195                        }
196                        StringBuilder sb = new StringBuilder();
197                        Formatter formatter = new Formatter(sb,Locale.US);
198                        ArrayList<String> locations = new ArrayList<String>();
199                        for(FeatureInterface<AbstractSequence<C>, C> f  : feature.getChildrenFeatures()) {
200                                locations.add(_insdc_location_string_ignoring_strand_and_subfeatures(f.getLocations(), record_length));
201                        }
202                        String location = StringManipulationHelper.join(locations, ",");
203                        formatter.format("complement(%s(%s))", /*feature.location_operator*/ "join", location);
204                        String output = formatter.toString();
205                        formatter.close();
206                        return output;
207                }
208                //This covers typical forward strand features, and also an evil mixed strand:
209                StringBuilder sb = new StringBuilder();
210                Formatter formatter = new Formatter(sb,Locale.US);
211                ArrayList<String> locations = new ArrayList<String>();
212                for(FeatureInterface<AbstractSequence<C>, C> f  : feature.getChildrenFeatures()) {
213                        locations.add(_insdc_location_string_ignoring_strand_and_subfeatures(f.getLocations(), record_length));
214                }
215                String location =  StringManipulationHelper.join(locations, ",");
216                formatter.format("%s(%s)", /*feature.location_operator*/ "join", location);
217                String output = formatter.toString();
218                formatter.close();
219                return output;
220        }
221
222        private String _insdc_location_string_ignoring_strand_and_subfeatures(
223                        //SequenceLocation<AbstractSequence<C>, C> sequenceLocation,
224                                                AbstractLocation sequenceLocation,
225                        int record_length) {
226        /*
227        if location.ref:
228                ref = "%s:" % location.ref
229        else:
230                ref = ""
231        assert not location.ref_db
232        */
233                String ref = "";
234                if(!sequenceLocation.getStart().isUncertain() && !sequenceLocation.getEnd().isUncertain() && sequenceLocation.getStart() == sequenceLocation.getEnd()) {
235                        //Special case, for 12:12 return 12^13
236                        //(a zero length slice, meaning the point between two letters)
237                        if(sequenceLocation.getEnd().getPosition() == record_length) {
238                                //Very special case, for a between position at the end of a
239                                //sequence (used on some circular genomes, Bug 3098) we have
240                                //N:N so return N^1
241                                StringBuilder sb = new StringBuilder();
242                                Formatter formatter = new Formatter(sb,Locale.US);
243                                formatter.format("%s%d^1", ref, record_length);
244                                String output = formatter.toString();
245                                formatter.close();
246                                return output;
247                        } else {
248                                StringBuilder sb = new StringBuilder();
249                                Formatter formatter = new Formatter(sb,Locale.US);
250                                formatter.format("%s%d^%d", ref, sequenceLocation.getStart().getPosition(), sequenceLocation.getEnd().getPosition());
251                                String output = formatter.toString();
252                                formatter.close();
253                                return output;
254                        }
255                }
256                if(!sequenceLocation.getStart().isUncertain() && !sequenceLocation.getEnd().isUncertain() && sequenceLocation.getStart().getPosition() + 1 == sequenceLocation.getEnd().getPosition()) {
257                        //Special case, for 11:12 return 12 rather than 12..12
258                        //(a length one slice, meaning a single letter)
259                        StringBuilder sb = new StringBuilder();
260                        Formatter formatter = new Formatter(sb,Locale.US);
261                        formatter.format("%s%d", ref, sequenceLocation.getEnd().getPosition());
262                        String output = formatter.toString();
263                        formatter.close();
264                        return output;
265                } else if(sequenceLocation.getStart().isUnknown() || sequenceLocation.getEnd().isUnknown()) {
266                        //Special case for features from SwissProt/UniProt files
267                        if(sequenceLocation.getStart().isUnknown() && sequenceLocation.getEnd().isUnknown()) {
268                                throw new RuntimeException("Feature with unknown location");
269                        } else if(sequenceLocation.getStart().isUnknown()) {
270                                //Treat the unknown start position as a BeforePosition
271                                StringBuilder sb = new StringBuilder();
272                                Formatter formatter = new Formatter(sb,Locale.US);
273                                formatter.format("%s<%d..%s", ref, sequenceLocation.getEnd().getPosition(), _insdc_feature_position_string(sequenceLocation.getEnd()));
274                                String output = formatter.toString();
275                                formatter.close();
276                                return output;
277                        } else {
278                                //Treat the unknown start position as an AfterPosition
279                                StringBuilder sb = new StringBuilder();
280                                Formatter formatter = new Formatter(sb,Locale.US);
281                                formatter.format("%s%s..>%d", ref, _insdc_feature_position_string(sequenceLocation.getStart()), sequenceLocation.getStart().getPosition());
282                                String output = formatter.toString();
283                                formatter.close();
284                                return output;
285                        }
286                } else {
287                        //Typical case, e.g. 12..15 gets mapped to 11:15
288                        return ref + _insdc_feature_position_string(sequenceLocation.getStart(), 0) + ".." + _insdc_feature_position_string(sequenceLocation.getEnd());
289                }
290        }
291        private String _insdc_feature_position_string(Point location) {
292                // TODO Auto-generated method stub
293                return _insdc_feature_position_string(location, 0);
294        }
295
296        /**
297         * Build a GenBank/EMBL position string (PRIVATE).
298         * @param location
299         * @param increment
300         */
301        private String _insdc_feature_position_string(Point location, int increment) {
302                        StringBuilder sb = new StringBuilder();
303                        Formatter formatter = new Formatter(sb,Locale.US);
304                        formatter.format("%s", location.getPosition() + increment);
305                        String output = formatter.toString();
306                        formatter.close();
307                        return output;
308
309        /*
310        if isinstance(pos, SeqFeature.ExactPosition):
311                return "%i" % (pos.position+offset)
312        elif isinstance(pos, SeqFeature.WithinPosition):
313                return "(%i.%i)" % (pos.position + offset,
314                                                        pos.position + pos.extension + offset)
315        elif isinstance(pos, SeqFeature.BetweenPosition):
316                return "(%i^%i)" % (pos.position + offset,
317                                                        pos.position + pos.extension + offset)
318        elif isinstance(pos, SeqFeature.BeforePosition):
319                return "<%i" % (pos.position + offset)
320        elif isinstance(pos, SeqFeature.AfterPosition):
321                return ">%i" % (pos.position + offset)
322        elif isinstance(pos, SeqFeature.OneOfPosition):
323                return "one-of(%s)" \
324                           % ",".join([_insdc_feature_position_string(p,offset) \
325                                                   for p in pos.position_choices])
326        elif isinstance(pos, SeqFeature.AbstractPosition):
327                raise NotImplementedError("Please report this as a bug in Biopython.")
328        else:
329                raise ValueError("Expected a SeqFeature position object.")
330                 */
331        }
332
333        /**
334         * Returns a list of strings.
335         *
336         *   Any single words which are too long get returned as a whole line
337         *   (e.g. URLs) without an exception or warning.
338         * @param text
339         * @param max_len
340         */
341        protected ArrayList<String> _split_multi_line(String text, int max_len) {
342                // TODO Auto-generated method stub
343                ArrayList<String> output = new ArrayList<String>();
344                text = text.trim();
345                if(text.length() <= max_len) {
346                        output.add(text);
347                        return output;
348                }
349
350                ArrayList<String> words = new ArrayList<String>();
351                Collections.addAll(words, text.split("\\s+"));
352                while(!words.isEmpty()) {
353                        text = words.remove(0);
354                        while(!words.isEmpty() && (text.length() + 1 + words.get(0).length()) <= max_len) {
355                                text += " " + words.remove(0);
356                                text = text.trim();
357                        }
358                        output.add(text);
359                }
360                assert words.isEmpty();
361                return output;
362        }
363
364
365}