#! /usr/bin/perl

#  po-debiandoc-fix    - fix Debiandoc SGML to process
#                        with po-debiandoc tools
#  Copyright (C) 2001-2002  Denis Barbier <barbier@debian.org>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or (at
#  your option) any later version.
#
#  This program is distributed in the hope that it will be useful, but
#  WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the
#  Free Software Foundation, Inc.,
#  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#   This script is part of po-debiandoc

use strict;
use IO::File;
use Getopt::Long;

use vars(qw($opt_h $opt_v $opt_f $opt_r $opt_N));

sub usage {
        print STDERR <<"EOT";
Usage: $0 [-h] [-v] [-f LEVEL] [-r] inputfile
Options:
  -h,  --help           display this help message
  -v,  --verbose        enable verbose mode
  -r,  --no-marked-section-delimiters
                        remove marked-section start and end strings
  -f,  --fragment=LEVEL use when input is a fragment file
                        (chapter or appendix)
  -N,  --fix-newline    add a space between end tag delimiter and newline
EOT
        exit($_[0]);
}

sub verbose {
        return unless $opt_v;
        print STDERR "po-debiandoc-fix:Verbose: ", shift, "\n";
}

$Getopt::Long::bundling = 1;
$Getopt::Long::getopt_compat = 0;
if (not Getopt::Long::GetOptions(qw(
                        h|help
                        v|verbose
                        r|no-marked-section-delimiters
                        f|fragment=s
                        N|fix-newline
                ))) {
        warn "Try `$0 --help' for more information.\n";
        exit(1);
}

usage(0) if $opt_h;
usage(1) unless $#ARGV <= 0;

my $orig = new IO::File;
if ($#ARGV == -1 || $ARGV[0] eq '-') {
        die "Unable to read from stdin"
                unless $orig->open("-");
} else {
        die "Unable to read from $ARGV[0]"
                unless $orig->open("< $ARGV[0]");
}

#
#  Get prolog
#
sub GetProlog {
        my $text = shift;
        if ($text =~ s/<debiandoc.*//si) {
                return $text;
        }
}

#
#  Get entities for marked sections
#
sub GetEntities {
        my $text = shift;
        my %entities = ();
        while ($text =~ m/<!\[\s*([^\[]+)\[/g) {
                my $names = $1;
                foreach my $e (split('\%', $names)) {
                        $e =~ s/^\s*//;
                        $e =~ s/\s*$//;
                        $entities{$e} = 1 if $e =~ m/\S/;
                }
        }
        return %entities;
}

sub RemoveMarkup {
        my $text = shift;
        $text =~ s/<!\[\s*[^\[]+\[\s*//g;
        $text =~ s/<!--[^<]*?-->//g;
        $text =~ s/\]\s*\]*>//g;
        #   Remove end tags, which may belong to previous paragraph
        1 while ($text =~ s|^\s*</\w+>||s);
        return $text;
}

my ($text, $iflag, %entities, $prolog, $fh, $tmpfile);
{
        local $/ = undef;
        $text = <$orig>;
}
$orig->close() or die "Unable to close $ARGV[0]";

%entities = GetEntities($text) unless $opt_r;
if (%entities) {
        $iflag = '-i '.join(' -i ', keys %entities);
}
$prolog = GetProlog($text);
if ($opt_f) {
        die "po-debiandoc-fix:Error: <debiandoc> found, -f flag is forbidden\n"
                if $prolog ne '';
        $prolog = '<!DOCTYPE debiandoc SYSTEM [';
        foreach (keys %entities) {
                $prolog .= '<!entity % '.$_.' "INCLUDE">';
        }
        $prolog .= ']>';
} else {
        die "po-debiandoc-fix:Error: <debiandoc> not found, -f flag is needed\n"
                if $prolog eq '';
}

$text =~ s/&/{po-ddf-amp}/g;
if ($opt_r) {
        $text =~ s/<!\[\s*[^\[]+\[//g;
        $text =~ s/\]\s*\]>//g;
}

my $fh = new IO::File;
$tmpfile = ($ENV{TMPDIR} || '/tmp/')."po-ddf-$$";
sysopen($fh, $tmpfile, O_CREAT | O_EXCL | O_RDWR, 0600)
        or die "Unable to open temporary file";

if ($opt_f) {
        print $fh '<debiandoc> <book> <titlepag> <title><author> <name>[po-debiandoc-dummy]</name> </titlepag>';
        print $fh '<chapt><heading></heading></chapt>' if $opt_f eq 'appendix';
        print $fh $text;
        print $fh '</book></debiandoc>';
} else {
        print $fh $text;
}
close ($fh);

open (TMP, "spam $iflag -momittag $tmpfile 2>/dev/null |");
{
        local $/ = undef;
        $text = <TMP>;
}
close (TMP);
unlink($tmpfile);

#   nsgmls gobbles newline when following end tags, so add a space
#   which is removed after
$text =~ s/(> *)$/$1 /gm if $opt_N;

#   Escape inline marked sections
my ($textN, $last);
while ($text =~ m|\G(.*?)<(!\[\s*[^\[]+\[\s*)(.*?)(\]\s*\]\s*)>|sg) {
        $last = pos($text);
        my ($pre, $begin, $mid, $end) = ($1, $2, $3, $4);
        if ($mid =~ s/(.*)<(!\[\s*[^\[]+\[\s*)(.*)/$3/s) {
                $pre .= '<'.$begin.$1;
                $begin = $2;
        }
        $textN .= $pre;
        my $mid_text = RemoveMarkup($mid);
        if ($mid_text =~ m{^\s*<(p|heading|chapt|appendix|sect|sect[1-4])\b}si) {
                $textN .= '<';
        } else {
                $textN .= '{po-ddf-lt}';
        }
        $textN .= $begin.$mid.$end;
        if ($mid_text =~ m{</(p|heading|chapt|appendix|sect|sect[1-4])>\s*$}si) {
                $textN .= '>';
        } else {
                $textN .= '{po-ddf-gt}';
        }
}
$text = $textN.substr($text, $last);

#   Move around missing optional end tags
1 while ($text =~ s{(<!\[\s*[^\[]+\[|\]\s*\]>|<!--[^<]*?-->)(\s*</(?:P|HEADING|CHAPT|APPENDIX|SECT|SECT[1-4])>)}{$2$1}s);
1 while ($text =~ s{(\s+)(</(?:p|heading|chapt|appendix|sect|sect[1-4])>)}{$2$1}i);
#   convert into lowercase letters
1 while ($text =~ s{(</?(?:P|HEADING|CHAPT|APPENDIX|SECT|SECT[1-4])>)}{\L$1\E});

print $prolog.$text;

1;
