diff options
Diffstat (limited to 'manual/xmlformat/xmlformat.pl')
-rwxr-xr-x | manual/xmlformat/xmlformat.pl | 1745 |
1 files changed, 1745 insertions, 0 deletions
diff --git a/manual/xmlformat/xmlformat.pl b/manual/xmlformat/xmlformat.pl new file mode 100755 index 0000000000..877f5f110f --- /dev/null +++ b/manual/xmlformat/xmlformat.pl @@ -0,0 +1,1745 @@ +#! /usr/bin/perl -w +# vim:set ts=2 sw=2 expandtab: + +# xmlformat - configurable XML file formatter/pretty-printer + +# Copyright (c) 2004, 2005 Kitebird, LLC. All rights reserved. +# Some portions are based on the REX shallow XML parser, which +# is Copyright (c) 1998, Robert D. Cameron. These include the +# regular expression parsing variables and the shallow_parse() +# method. +# This software is licensed as described in the file LICENSE, +# which you should have received as part of this distribution. + +# Syntax: xmlformat [config-file] xml-file + +# Default config file is $ENV{XMLFORMAT_CONF} or ./xmlformat.conf, in that +# order. + +# Paul DuBois +# paul@kitebird.com +# 2003-12-14 + +# The input document first is parsed into a list of strings. Each string +# represents one of the following: +# - text node +# - processing instruction (the XML declaration is treated as a PI) +# - comment +# - CDATA section +# - DOCTYPE declaration +# - element tag (either <abc>, </abc>, or <abc/>), *including attributes* + +# Entities are left untouched. They appear in their original form as part +# of the text node in which they occur. + +# The list of strings then is converted to a hierarchical structure. +# The document top level is represented by a reference to a list. +# Each list element is a reference to a node -- a hash that has "type" +# and "content" key/value pairs. The "type" key indicates the node +# type and has one of the following values: + +# "text" - text node +# "pi" - processing instruction node +# "comment" - comment node +# "CDATA" - CDATA section node +# "DOCTYPE" - DOCTYPE node +# "elt" - element node + +# (For purposes of this program, it's really only necessary to have "text", +# "elt", and "other". The types other than "text" and "elt" currently are +# all treated the same way.) + +# For all but element nodes, the "content" value is the text of the node. + +# For element nodes, the "content" hash is a reference to a list of +# nodes for the element's children. In addition, an element node has +# three additional key/value pairs: +# - The "name" value is the tag name within the opening tag, minus angle +# brackets or attributes. +# - The "open_tag" value is the full opening tag, which may also be the +# closing tag. +# - The "close_tag" value depends on the opening tag. If the open tag is +# "<abc>", the close tag is "</abc>". If the open tag is "<abc/>", the +# close tag is the empty string. + +# If the tree structure is converted back into a string with +# tree_stringify(), the result can be compared to the input file +# as a regression test. The string should be identical to the original +# input document. + +use strict; + +use Getopt::Long; +$Getopt::Long::ignorecase = 0; # options are case sensitive +$Getopt::Long::bundling = 1; # allow short options to be bundled + +my $PROG_NAME = "xmlformat"; +my $PROG_VERSION = "1.04"; +my $PROG_LANG = "Perl"; + +# ---------------------------------------------------------------------- + +package XMLFormat; + +use strict; + +# ---------------------------------------------------------------------- + +# Regular expressions for parsing document components. Based on REX. + +# SPE = shallow parsing expression +# SE = scanning expression +# CE = completion expression +# RSB = right square brackets +# QM = question mark + +my $TextSE = "[^<]+"; +my $UntilHyphen = "[^-]*-"; +my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-"; +my $CommentCE = "$Until2Hyphens>?"; +my $UntilRSBs = "[^\\]]*\\](?:[^\\]]+\\])*\\]+"; +my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>"; +my $S = "[ \\n\\t\\r]+"; +my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]"; +my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]"; +my $Name = "(?:$NameStrt)(?:$NameChar)*"; +my $QuoteSE = "\"[^\"]*\"|'[^']*'"; +my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*"; +my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>"; +my $S1 = "[\\n\\r\\t ]"; +my $UntilQMs = "[^?]*\\?+"; +my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>"; +my $DT_ItemSE = +"<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S"; +my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*\\](?:$S)?)?>?"; +my $DeclCE = +"--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?"; +my $PI_CE = "$Name(?:$PI_Tail)?"; +my $EndTagCE = "$Name(?:$S)?>?"; +my $AttValSE = "\"[^<\"]*\"|'[^<']*'"; +my $ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?"; +my $MarkupSPE = +"<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)"; +my $XML_SPE = "$TextSE|$MarkupSPE"; + +# ---------------------------------------------------------------------- + +# Allowable options and their possible values: +# - The keys of this hash are the allowable option names +# - The value for each key is list of allowable option values +# - If the value is undef, the option value must be numeric +# If any new formatting option is added to this program, it +# must be specified here, *and* a default value for it should +# be listed in the *DOCUMENT and *DEFAULT pseudo-element +# option hashes. + +my %opt_list = ( + "format" => [ "block", "inline", "verbatim" ], + "normalize" => [ "yes", "no" ], + "subindent" => undef, + "wrap-length" => undef, + "entry-break" => undef, + "exit-break" => undef, + "element-break" => undef +); + +# Object creation: set up the default formatting configuration +# and variables for maintaining input and output document. + +sub new +{ +my $type = shift; + + my $self = {}; + + # Formatting options for each element. + + $self->{elt_opts} = { }; + + # The formatting options for the *DOCUMENT and *DEFAULT pseudo-elements can + # be overridden in the configuration file, but the options must also be + # built in to make sure they exist if not specified in the configuration + # file. Each of the structures must have a value for every option. + + # Options for top-level document children. + # - Do not change entry-break: 0 ensures no extra newlines before + # first element of output. + # - Do not change exit-break: 1 ensures a newline after final element + # of output document. + # - It's probably best not to change any of the others, except perhaps + # if you want to increase the element-break. + + $self->{elt_opts}->{"*DOCUMENT"} = { + "format" => "block", + "normalize" => "no", + "subindent" => 0, + "wrap-length" => 0, + "entry-break" => 0, # do not change + "exit-break" => 1, # do not change + "element-break" => 1 + }; + + # Default options. These are used for any elements in the document + # that are not specified explicitly in the configuration file. + + $self->{elt_opts}->{"*DEFAULT"} = { + "format" => "block", + "normalize" => "no", + "subindent" => 1, + "wrap-length" => 0, + "entry-break" => 1, + "exit-break" => 1, + "element-break" => 1 + }; + + # Run the *DOCUMENT and *DEFAULT options through the option-checker + # to verify that the built-in values are legal. + + my $err_count = 0; + + foreach my $elt_name (keys (%{$self->{elt_opts}})) # ... for each element + { + # Check each option for element + while (my ($opt_name, $opt_val) = each (%{$self->{elt_opts}->{$elt_name}})) + { + my $err_msg; + + ($opt_val, $err_msg) = check_option ($opt_name, $opt_val); + if (!defined ($err_msg)) + { + $self->{elt_opts}->{$elt_name}->{$opt_name} = $opt_val; + } + else + { + warn "LOGIC ERROR: $elt_name default option is invalid\n"; + warn "$err_msg\n"; + ++$err_count; + } + } + } + + # Make sure that the every option is represented in the + # *DOCUMENT and *DEFAULT structures. + + foreach my $opt_name (keys (%opt_list)) + { + foreach my $elt_name (keys (%{$self->{elt_opts}})) + { + if (!exists ($self->{elt_opts}->{$elt_name}->{$opt_name})) + { + warn "LOGIC ERROR: $elt_name has no default '$opt_name' option\n"; + ++$err_count; + } + } + } + + die "Cannot continue; internal default formatting options must be fixed\n" + if $err_count > 0; + + bless $self, $type; # bless object and return it +} + +# Initialize the variables that are used per-document + +sub init_doc_vars +{ +my $self = shift; + + # Elements that are used in the document but not named explicitly + # in the configuration file. + + $self->{unconf_elts} = { }; + + # List of tokens for current document. + + $self->{tokens} = [ ]; + + # List of line numbers for each token + + $self->{line_num} = [ ]; + + # Document node tree (constructed from the token list). + + $self->{tree} = [ ]; + + # Variables for formatting operations: + # out_doc = resulting output document (constructed from document tree) + # pending = array of pending tokens being held until flushed + + $self->{out_doc} = ""; + $self->{pending} = [ ]; + + # Inline elements within block elements are processed using the + # text normalization (and possible line-wrapping) values of their + # enclosing block. Blocks and inlines may be nested, so we maintain + # a stack that allows the normalize/wrap-length values of the current + # block to be determined. + + $self->{block_name_stack} = [ ]; # for debugging + $self->{block_opts_stack} = [ ]; + + # A similar stack for maintaining each block's current break type. + + $self->{block_break_type_stack} = [ ]; +} + +# Accessors for token list and resulting output document + +sub tokens +{ +my $self = shift; + + return $self->{tokens}; +} + +sub out_doc +{ +my $self = shift; + + return $self->{out_doc}; +} + + +# Methods for adding strings to output document or +# to the pending output array + +sub add_to_doc +{ +my ($self, $str) = @_; + + $self->{out_doc} .= $str; +} + +sub add_to_pending +{ +my ($self, $str) = @_; + + push (@{$self->{pending}}, $str); +} + + +# Block stack mainenance methods + +# Push options onto or pop options off from the stack. When doing +# this, also push or pop an element onto the break-level stack. + +sub begin_block +{ +my ($self, $name, $opts) = @_; + + push (@{$self->{block_name_stack}}, $name); + push (@{$self->{block_opts_stack}}, $opts); + push (@{$self->{block_break_type_stack}}, "entry-break"); +} + +sub end_block +{ +my $self = shift; + + pop (@{$self->{block_name_stack}}); + pop (@{$self->{block_opts_stack}}); + pop (@{$self->{block_break_type_stack}}); +} + +# Return the current block's normalization status or wrap length + +sub block_normalize +{ +my $self = shift; + + my $size = @{$self->{block_opts_stack}}; + my $opts = $self->{block_opts_stack}->[$size-1]; + return $opts->{normalize} eq "yes"; +} + +sub block_wrap_length +{ +my $self = shift; + + my $size = @{$self->{block_opts_stack}}; + my $opts = $self->{block_opts_stack}->[$size-1]; + return $opts->{"wrap-length"}; +} + +# Set the current block's break type, or return the number of newlines +# for the block's break type + +sub set_block_break_type +{ +my ($self, $type) = @_; + + my $size = @{$self->{block_break_type_stack}}; + $self->{block_break_type_stack}->[$size-1] = $type; +} + +sub block_break_value +{ +my $self = shift; + + my $size = @{$self->{block_opts_stack}}; + my $opts = $self->{block_opts_stack}->[$size-1]; + $size = @{$self->{block_break_type_stack}}; + my $type = $self->{block_break_type_stack}->[$size-1]; + return $opts->{$type}; +} + + +# ---------------------------------------------------------------------- + +# Read configuration information. For each element, construct a hash +# containing a hash key and value for each option name and value. +# After reading the file, fill in missing option values for +# incomplete option structures using the *DEFAULT options. + +sub read_config +{ +my $self = shift; +my $conf_file = shift; +my @elt_names = (); +my $err_msg; +my $in_continuation = 0; +my $saved_line = ""; + + open (FH, $conf_file) or die "Cannot read config file $conf_file: $!\n"; + while (<FH>) + { + chomp; + + next if /^\s*($|#)/; # skip blank lines, comments + if ($in_continuation) + { + $_ = $saved_line . " " . $_; + $saved_line = ""; + $in_continuation = 0; + } + if (!/^\s/) + { + # Line doesn't begin with whitespace, so it lists element names. + # Names are separated by whitespace or commas, possibly followed + # by a continuation character or a comment. + if (/\\$/) + { + s/\\$//; # remove continuation character + $saved_line = $_; + $in_continuation = 1; + next; + } + s/\s*#.*$//; # remove any trailing comment + @elt_names = split (/[\s,]+/, $_); + # make sure each name has an entry in the elt_opts structure + foreach my $elt_name (@elt_names) + { + $self->{elt_opts}->{$elt_name} = { } + unless exists ($self->{elt_opts}->{$elt_name}); + } + } + else + { + # Line begins with whitespace, so it contains an option + # to apply to the current element list, possibly followed by + # a comment. First check that there is a current list. + # Then parse the option name/value. + + die "$conf_file:$.: Option setting found before any " + . "elements were named.\n" + if !@elt_names; + s/\s*#.*$//; + my ($opt_name, $opt_val) = /^\s+(\S+)(?:\s+|\s*=\s*)(\S+)$/; + die "$conf_file:$.: Malformed line: $_\n" unless defined ($opt_val); + + # Check option. If illegal, die with message. Otherwise, + # add option to each element in current element list + + ($opt_val, $err_msg) = check_option ($opt_name, $opt_val); + die "$conf_file:$.: $err_msg\n" if defined ($err_msg); + foreach my $elt_name (@elt_names) + { + $self->{elt_opts}->{$elt_name}->{$opt_name} = $opt_val; + } + } + } + close (FH); + + # For any element that has missing option values, fill in the values + # using the options for the *DEFAULT pseudo-element. This speeds up + # element option lookups later. It also makes it unnecessary to test + # each option to see if it's defined: All element option structures + # will have every option defined. + + my $def_opts = $self->{elt_opts}->{"*DEFAULT"}; + + foreach my $elt_name (keys (%{$self->{elt_opts}})) + { + next if $elt_name eq "*DEFAULT"; + foreach my $opt_name (keys (%{$def_opts})) + { + next if exists ($self->{elt_opts}->{$elt_name}->{$opt_name}); # already set + $self->{elt_opts}->{$elt_name}->{$opt_name} = $def_opts->{$opt_name}; + } + } +} + + +# Check option name to make sure it's legal. Check the value to make sure +# that it's legal for the name. Return a two-element array: +# (value, undef) if the option name and value are legal. +# (undef, message) if an error was found; message contains error message. +# For legal values, the returned value should be assigned to the option, +# because it may get type-converted here. + +sub check_option +{ +my ($opt_name, $opt_val) = @_; + + # - Check option name to make sure it's a legal option + # - Then check the value. If there is a list of values + # the value must be one of them. Otherwise, the value + # must be an integer. + + return (undef, "Unknown option name: $opt_name") + unless exists ($opt_list{$opt_name}); + my $allowable_val = $opt_list{$opt_name}; + if (defined ($allowable_val)) + { + return (undef, "Unknown '$opt_name' value: $opt_val") + unless grep (/^$opt_val$/, @{$allowable_val}); + } + else # other options should be numeric + { + # "$opt_val" converts $opt_val to string for pattern match + return (undef, "'$opt_name' value ($opt_val) should be an integer") + unless "$opt_val" =~ /^\d+$/; + } + return ($opt_val, undef); +} + + +# Return hash of option values for a given element. If no options are found: +# - Add the element name to the list of unconfigured options. +# - Assign the default options to the element. (This way the test for the +# option fails only once.) + +sub get_opts +{ +my $self = shift; +my $elt_name = shift; + + my $opts = $self->{elt_opts}->{$elt_name}; + if (!defined ($opts)) + { + $self->{unconf_elts}->{$elt_name} = 1; + $opts = $self->{elt_opts}->{$elt_name} = $self->{elt_opts}->{"*DEFAULT"}; + } + return $opts; +} + + +# Display contents of configuration options to be used to process document. +# For each element named in the elt_opts structure, display its format +# type, and those options that apply to the type. + +sub display_config +{ +my $self = shift; +# Format types and the additional options that apply to each type +my $format_opts = { + "block" => [ + "entry-break", + "element-break", + "exit-break", + "subindent", + "normalize", + "wrap-length" + ], + "inline" => [ ], + "verbatim" => [ ] +}; + + foreach my $elt_name (sort (keys (%{$self->{elt_opts}}))) + { + print "$elt_name\n"; + my %opts = %{$self->{elt_opts}->{$elt_name}}; + my $format = $opts{format}; + # Write out format type, then options that apply to the format type + print " format = $format\n"; + foreach my $opt_name (@{$format_opts->{$format}}) + { + print " $opt_name = $opts{$opt_name}\n"; + } + print "\n"; + } +} + + +# Display the list of elements that are used in the document but not +# configured in the configuration file. + +# Then re-unconfigure the elements so that they won't be considered +# as configured for the next document, if there is one. + +sub display_unconfigured_elements +{ +my $self = shift; + + my @elts = keys (%{$self->{unconf_elts}}); + if (@elts == 0) + { + print "The document contains no unconfigured elements.\n"; + } + else + { + print "The following document elements were assigned no formatting options:\n"; + foreach my $line ($self->line_wrap ([ join (" ", sort (@elts)) ], 0, 0, 65)) + { + print "$line\n"; + } + } + + foreach my $elt_name (@elts) + { + delete ($self->{elt_opts}->{$elt_name}); + } +} + +# ---------------------------------------------------------------------- + +# Main document processing routine. +# - Argument is a string representing an input document +# - Return value is the reformatted document, or undef. An undef return +# signifies either that an error occurred, or that some option was +# given that suppresses document output. In either case, don't write +# any output for the document. Any error messages will already have +# been printed when this returns. + +sub process_doc +{ +my $self = shift; +my ($doc, $verbose, $check_parser, $canonize_only, $show_unconf_elts) = @_; +my $str; + + $self->init_doc_vars (); + + # Perform lexical parse to split document into list of tokens + warn "Parsing document...\n" if $verbose; + $self->shallow_parse ($doc); + + if ($check_parser) + { + warn "Checking parser...\n" if $verbose; + # concatentation of tokens should be identical to original document + if ($doc eq join ("", @{$self->tokens ()})) + { + print "Parser is okay\n"; + } + else + { + print "PARSER ERROR: document token concatenation differs from document\n"; + } + return undef; + } + + # Assign input line number to each token + $self->assign_line_numbers (); + + # Look for and report any error tokens returned by parser + warn "Checking document for errors...\n" if $verbose; + if ($self->report_errors () > 0) + { + warn "Cannot continue processing document.\n"; + return undef; + } + + # Convert the token list to a tree structure + warn "Converting document tokens to tree...\n" if $verbose; + if ($self->tokens_to_tree () > 0) + { + warn "Cannot continue processing document.\n"; + return undef; + } + + # Check: Stringify the tree to convert it back to a single string, + # then compare to original document string (should be identical) + # (This is an integrity check on the validity of the to-tree and stringify + # operations; if one or both do not work properly, a mismatch should occur.) + #$str = $self->tree_stringify (); + #print $str; + #warn "ERROR: mismatch between document and resulting string\n" if $doc ne $str; + + # Canonize tree to remove extraneous whitespace + warn "Canonizing document tree...\n" if $verbose; + $self->tree_canonize (); + + if ($canonize_only) + { + print $self->tree_stringify () . "\n"; + return undef; + } + + # One side-effect of canonizing the tree is that the formatting + # options are looked up for each element in the document. That + # causes the list of elements that have no explicit configuration + # to be built. Display the list and return if user requested it. + + if ($show_unconf_elts) + { + $self->display_unconfigured_elements (); + return undef; + } + + # Format the tree to produce formatted XML as a single string + warn "Formatting document tree...\n" if $verbose; + $self->tree_format (); + + # If the document is not empty, add a newline and emit a warning if + # reformatting failed to add a trailing newline. This shouldn't + # happen if the *DOCUMENT options are set up with exit-break = 1, + # which is the reason for the warning rather than just silently + # adding the newline. + + $str = $self->out_doc (); + if ($str ne "" && $str !~ /\n$/) + { + warn "LOGIC ERROR: trailing newline had to be added\n"; + $str .= "\n"; + } + + return $str; +} + +# ---------------------------------------------------------------------- + +# Parse XML document into array of tokens and store array + +sub shallow_parse +{ +my ($self, $xml_document) = @_; + + $self->{tokens} = [ $xml_document =~ /$XML_SPE/g ]; +} + +# ---------------------------------------------------------------------- + +# Extract a tag name from a tag and return it. + +# Dies if the tag cannot be found, because this is supposed to be +# called only with a legal tag. + +sub extract_tag_name +{ +my $tag = shift; + + die "Cannot find tag name in tag: $tag\n" unless $tag =~ /^<\/?($Name)/; + return $1; +} + +# ---------------------------------------------------------------------- + +# Assign an input line number to each token. The number indicates +# the line number on which the token begins. + +sub assign_line_numbers +{ +my $self = shift; +my $line_num = 1; + + $self->{line_num} = [ ]; + for (my $i = 0; $i < @{$self->{tokens}}; $i++) + { + my $token = $self->{tokens}->[$i]; + push (@{$self->{line_num}}, $line_num); + # count newlines and increment line counter (tr returns no. of matches) + $line_num += ($token =~ tr/\n/\n/); + } +} + +# ---------------------------------------------------------------------- + +# Check token list for errors and report any that are found. Error +# tokens are those that begin with "<" but do not end with ">". + +# Returns the error count. + +# Does not modify the original token list. + +sub report_errors +{ +my $self = shift; +my $err_count = 0; + + for (my $i = 0; $i < @{$self->{tokens}}; $i++) + { + my $token = $self->{tokens}->[$i]; + if ($token =~ /^</ && $token !~ />$/) + { + my $line_num = $self->{line_num}->[$i]; + warn "Malformed token at line $line_num, token " . ($i+1) . ": $token\n"; + ++$err_count; + } + } + warn "Number of errors found: $err_count\n" if $err_count > 0; + return $err_count; +} + +# ---------------------------------------------------------------------- + +# Helper routine to print tag stack for tokens_to_tree + +sub print_tag_stack +{ +my ($label, @stack) = @_; + if (@stack < 1) + { + warn " $label: none\n"; + } + else + { + warn " $label:\n"; + for (my $i = 0; $i < @stack; $i++) + { + warn " ", ($i+1), ": ", $stack[$i], "\n"; + } + } +} + +# Convert the list of XML document tokens to a tree representation. +# The implementation uses a loop and a stack rather than recursion. + +# Does not modify the original token list. + +# Returns an error count. + +sub tokens_to_tree +{ +my $self = shift; + + my @tag_stack = (); # stack for element tags + my @children_stack = (); # stack for lists of children + my $children = [ ]; # current list of children + my $err_count = 0; + + for (my $i = 0; $i < @{$self->{tokens}}; $i++) + { + my $token = $self->{tokens}->[$i]; + my $line_num = $self->{line_num}->[$i]; + my $tok_err = "Error near line $line_num, token " . ($i+1) . " ($token)"; + if ($token !~ /^</) # text + { + push (@{$children}, text_node ($token)); + } + elsif ($token =~ /^<!--/) # comment + { + push (@{$children}, comment_node ($token)); + } + elsif ($token =~ /^<\?/) # processing instruction + { + push (@{$children}, pi_node ($token)); + } + elsif ($token =~ /^<!DOCTYPE/) # DOCTYPE + { + push (@{$children}, doctype_node ($token)); + } + elsif ($token =~ /^<!\[/) # CDATA + { + push (@{$children}, cdata_node ($token)); + } + elsif ($token =~ /^<\//) # element close tag + { + if (!@tag_stack) + { + warn "$tok_err: Close tag w/o preceding open tag; malformed document?\n"; + ++$err_count; + next; + } + if (!@children_stack) + { + warn "$tok_err: Empty children stack; malformed document?\n"; + ++$err_count; + next; + } + my $tag = pop (@tag_stack); + my $open_tag_name = extract_tag_name ($tag); + my $close_tag_name = extract_tag_name ($token); + if ($open_tag_name ne $close_tag_name) + { + warn "$tok_err: Tag mismatch; malformed document?\n"; + warn " open tag: $tag\n"; + warn " close tag: $token\n"; + print_tag_stack ("enclosing tags", @tag_stack); + ++$err_count; + next; + } + my $elt = element_node ($tag, $token, $children); + $children = pop (@children_stack); + push (@{$children}, $elt); + } + else # element open tag + { + # If we reach here, we're seeing the open tag for an element: + # - If the tag is also the close tag (e.g., <abc/>), close the + # element immediately, giving it an empty child list. + # - Otherwise, push tag and child list on stacks, begin new child + # list for element body. + if ($token =~ /\/>$/) # tag is of form <abc/> + { + push (@{$children}, element_node ($token, "", [ ])); + } + else # tag is of form <abc> + { + push (@tag_stack, $token); + push (@children_stack, $children); + $children = [ ]; + } + } + } + + # At this point, the stacks should be empty if the document is + # well-formed. + + if (@tag_stack) + { + warn "Error at EOF: Unclosed tags; malformed document?\n"; + print_tag_stack ("unclosed tags", @tag_stack); + ++$err_count; + } + if (@children_stack) + { + warn "Error at EOF: Unprocessed child elements; malformed document?\n"; +# TODO: print out info about them + ++$err_count; + } + + $self->{tree} = $children; + return $err_count; +} + + +# Node-generating helper methods for tokens_to_tree + +# Generic node generator + +sub node { return { "type" => $_[0], "content" => $_[1] }; } + +# Generators for specific non-element nodes + +sub text_node { return node ("text", $_[0]); } +sub comment_node { return node ("comment", $_[0]); } +sub pi_node { return node ("pi", $_[0]); } +sub doctype_node { return node ("DOCTYPE", $_[0]); } +sub cdata_node { return node ("CDATA", $_[0]); } + +# For an element node, create a standard node with the type and content +# key/value pairs. Then add pairs for the "name", "open_tag", and +# "close_tag" hash keys. + +sub element_node +{ +my ($open_tag, $close_tag, $children) = @_; + + my $elt = node ("elt", $children); + # name is the open tag with angle brackets and attibutes stripped + $elt->{name} = extract_tag_name ($open_tag); + $elt->{open_tag} = $open_tag; + $elt->{close_tag} = $close_tag; + return $elt; +} + +# ---------------------------------------------------------------------- + +# Convert the given XML document tree (or subtree) to string form by +# concatentating all of its components. Argument is a reference +# to a list of nodes at a given level of the tree. + +# Does not modify the node list. + +sub tree_stringify +{ +my $self = shift; +my $children = shift || $self->{tree}; # use entire tree if no arg; +my $str = ""; + + for (my $i = 0; $i < @{$children}; $i++) + { + my $child = $children->[$i]; + + # - Elements have list of child nodes as content (process recursively) + # - All other node types have text content + + if ($child->{type} eq "elt") + { + $str .= $child->{open_tag} + . $self->tree_stringify ($child->{content}) + . $child->{close_tag}; + } + else + { + $str .= $child->{content}; + } + } + return $str; +} + +# ---------------------------------------------------------------------- + + +# Put tree in "canonical" form by eliminating extraneous whitespace +# from element text content. + +# $children is a list of child nodes + +# This function modifies the node list. + +# Canonizing occurs as follows: +# - Comment, PI, DOCTYPE, and CDATA nodes remain untouched +# - Verbatim elements and their descendants remain untouched +# - Within non-normalized block elements: +# - Delete all-whitespace text node children +# - Leave other text node children untouched +# - Within normalized block elements: +# - Convert runs of whitespace (including line-endings) to single spaces +# - Trim leading whitespace of first text node +# - Trim trailing whitespace of last text node +# - Trim whitespace that is adjacent to a verbatim or non-normalized +# sub-element. (For example, if a <programlisting> is followed by +# more text, delete any whitespace at beginning of that text.) +# - Within inline elements: +# - Normalize the same way as the enclosing block element, with the +# exception that a space at the beginning or end is not removed. +# (Otherwise, <para>three<literal> blind </literal>mice</para> +# would become <para>three<literal>blind</literal>mice</para>.) + +sub tree_canonize +{ +my $self = shift; + + $self->{tree} = $self->tree_canonize2 ($self->{tree}, "*DOCUMENT"); +} + + +sub tree_canonize2 +{ +my $self = shift; +my $children = shift; +my $par_name = shift; + + # Formatting options for parent + my $par_opts = $self->get_opts ($par_name); + + # If parent is a block element, remember its formatting options on + # the block stack so they can be used to control canonization of + # inline child elements. + + $self->begin_block ($par_name, $par_opts) if $par_opts->{format} eq "block"; + + # Iterate through list of child nodes to preserve, modify, or + # discard whitespace. Return resulting list of children. + + # Canonize element and text nodes. Leave everything else (comments, + # processing instructions, etc.) untouched. + + my @new_children = (); + + while (@{$children}) + { + my $child = shift (@{$children}); + + if ($child->{type} eq "elt") + { + # Leave verbatim elements untouched. For other element nodes, + # canonize child list using options appropriate to element. + + if ($self->get_opts ($child->{name})->{format} ne "verbatim") + { + $child->{content} = $self->tree_canonize2 ($child->{content}, + $child->{name}); + } + } + elsif ($child->{type} eq "text") + { + # Delete all-whitespace node or strip whitespace as appropriate. + + # Paranoia check: We should never get here for verbatim elements, + # because normalization is irrelevant for them. + + die "LOGIC ERROR: trying to canonize verbatim element $par_name!\n" + if $par_opts->{format} eq "verbatim"; + + if (!$self->block_normalize ()) + { + # Enclosing block is not normalized: + # - Delete child all-whitespace text nodes. + # - Leave other text nodes untouched. + + next if $child->{content} =~ /^\s*$/; + } + else + { + # Enclosing block is normalized, so normalize this text node: + # - Convert runs of whitespace characters (including + # line-endings characters) to single spaces. + # - Trim leading whitespace if this node is the first child + # of a block element or it follows a non-normalized node. + # - Trim leading whitespace if this node is the last child + # of a block element or it precedes a non-normalized node. + + # These are nil if there is no prev or next child + my $prev_child = $new_children[$#new_children]; + my $next_child = $children->[0]; + + $child->{content} =~ s/\s+/ /g; + $child->{content} =~ s/^ // + if (!defined ($prev_child) && $par_opts->{format} eq "block") + || $self->non_normalized_node ($prev_child); + $child->{content} =~ s/ $// + if (!defined ($next_child) && $par_opts->{format} eq "block") + || $self->non_normalized_node ($next_child); + + # If resulting text is empty, discard the node. + next if $child->{content} =~ /^$/; + } + } + push (@new_children, $child); + } + + # Pop block stack if parent was a block element + $self->end_block () if $par_opts->{format} eq "block"; + + return \@new_children; +} + + +# Helper function for tree_canonize(). + +# Determine whether a node is normalized. This is used to check +# the node that is adjacent to a given text node (either previous +# or following). +# - No is node is nil +# - No if the node is a verbatim element +# - If the node is a block element, yes or no according to its +# normalize option +# - No if the node is an inline element. Inlines are normalized +# if the parent block is normalized, but this method is not called +# except while examinine normalized blocks. So its inline children +# are also normalized. +# - No if node is a comment, PI, DOCTYPE, or CDATA section. These are +# treated like verbatim elements. + +sub non_normalized_node +{ +my $self = shift; +my $node = shift; + + return 0 if !$node; + my $type = $node->{type}; + if ($type eq "elt") + { + my $node_opts = $self->get_opts ($node->{name}); + if ($node_opts->{format} eq "verbatim") + { + return 1; + } + if ($node_opts->{format} eq "block") + { + return $node_opts->{normalize} eq "no"; + } + if ($node_opts->{format} eq "inline") + { + return 0; + } + die "LOGIC ERROR: non_normalized_node: unhandled node format.\n"; + } + if ($type eq "comment" || $type eq "pi" || $type eq "DOCTYPE" + || $type eq "CDATA") + { + return 1; + } + if ($type eq "text") + { + die "LOGIC ERROR: non_normalized_node: got called for text node.\n"; + } + die "LOGIC ERROR: non_normalized_node: unhandled node type.\n"; +} + +# ---------------------------------------------------------------------- + +# Format (pretty-print) the document tree + +# Does not modify the node list. + +# The class maintains two variables for storing output: +# - out_doc stores content that has been seen and "flushed". +# - pending stores an array of strings (content of text nodes and inline +# element tags). These are held until they need to be flushed, at +# which point they are concatenated and possibly wrapped/indented. +# Flushing occurs when a break needs to be written, which happens +# when something other than a text node or inline element is seen. + +# If parent name and children are not given, format the entire document. +# Assume prevailing indent = 0 if not given. + +sub tree_format +{ +my $self = shift; +my $par_name = shift || "*DOCUMENT"; # format entire document if no arg +my $children = shift || $self->{tree}; # use entire tree if no arg +my $indent = shift || 0; + + # Formatting options for parent element + my $par_opts = $self->get_opts ($par_name); + + # If parent is a block element: + # - Remember its formatting options on the block stack so they can + # be used to control formatting of inline child elements. + # - Set initial break type to entry-break. + # - Shift prevailing indent right before generating child content. + + if ($par_opts->{format} eq "block") + { + $self->begin_block ($par_name, $par_opts); + $self->set_block_break_type ("entry-break"); + $indent += $par_opts->{"subindent"}; + } + + # Variables for keeping track of whether the previous child + # was a text node. Used for controlling break behavior in + # non-normalized block elements: No line breaks are added around + # text in such elements, nor is indenting added. + + my $prev_child_is_text = 0; + my $cur_child_is_text = 0; + + foreach my $child (@{$children}) + { + $prev_child_is_text = $cur_child_is_text; + + # Text nodes: just add text to pending output + + if ($child->{type} eq "text") + { + $cur_child_is_text = 1; + $self->add_to_pending ($child->{content}); + next; + } + + $cur_child_is_text = 0; + + # Element nodes: handle depending on format type + + if ($child->{type} eq "elt") + { + my $child_opts = $self->get_opts ($child->{name}); + + # Verbatim elements: + # - Print literally without change (use _stringify). + # - Do not line-wrap or add any indent. + + if ($child_opts->{format} eq "verbatim") + { + $self->flush_pending ($indent); + $self->emit_break (0) + unless $prev_child_is_text && !$self->block_normalize (); + $self->set_block_break_type ("element-break"); + $self->add_to_doc ($child->{open_tag} + . $self->tree_stringify ($child->{content}) + . $child->{close_tag}); + next; + } + + # Inline elements: + # - Do not break or indent. + # - Do not line-wrap content; just add content to pending output + # and let it be wrapped as part of parent's content. + + if ($child_opts->{format} eq "inline") + { + $self->add_to_pending ($child->{open_tag}); + $self->tree_format ($child->{name}, $child->{content}, $indent); + $self->add_to_pending ($child->{close_tag}); + next; + } + + # If we get here, node is a block element. + + # - Break and flush any pending output + # - Break and indent (no indent if break count is zero) + # - Process element itself: + # - Put out opening tag + # - Put out element content + # - Put out any indent needed before closing tag. None needed if: + # - Element's exit-break is 0 (closing tag is not on new line, + # so don't indent it) + # - There is no separate closing tag (it was in <abc/> format) + # - Element has no children (tags will be written as + # <abc></abc>, so don't indent closing tag) + # - Element has children, but the block is not normalized and + # the last child is a text node + # - Put out closing tag + + $self->flush_pending ($indent); + $self->emit_break ($indent) + unless $prev_child_is_text && !$self->block_normalize (); + $self->set_block_break_type ("element-break"); + $self->add_to_doc ($child->{open_tag}); + $self->tree_format ($child->{name}, $child->{content}, $indent); + $self->add_to_doc (" " x $indent) + unless $child_opts->{"exit-break"} <= 0 + || $child->{close_tag} eq "" + || !@{$child->{content}} + || (@{$child->{content}} + && $child->{content}->[$#{$child->{content}}]->{type} eq "text" + && $child_opts->{normalize} eq "no"); + $self->add_to_doc ($child->{close_tag}); + next; + } + + # Comments, PIs, etc. (everything other than text and elements), + # treat similarly to verbatim block: + # - Flush any pending output + # - Put out a break + # - Add node content to collected output + + $self->flush_pending ($indent); + $self->emit_break (0) + unless $prev_child_is_text && !$self->block_normalize (); + $self->set_block_break_type ("element-break"); + $self->add_to_doc ($child->{content}); + } + + $prev_child_is_text = $cur_child_is_text; + + # Done processing current element's children now. + + # If current element is a block element: + # - If there were any children, flush any pending output and put + # out the exit break. + # - Pop the block stack + + if ($par_opts->{format} eq "block") + { + if (@{$children}) + { + $self->flush_pending ($indent); + $self->set_block_break_type ("exit-break"); + $self->emit_break (0) + unless $prev_child_is_text && !$self->block_normalize (); + } + $self->end_block (); + } +} + + +# Emit a break - the appropriate number of newlines according to the +# enclosing block's current break type. + +# In addition, emit the number of spaces indicated by indent. (indent +# > 0 when breaking just before emitting an element tag that should +# be indented within its parent element.) + +# Exception: Emit no indent if break count is zero. That indicates +# any following output will be written on the same output line, not +# indented on a new line. + +# Initially, when processing a node's child list, the break type is +# set to entry-break. Each subsequent break is an element-break. +# (After child list has been processed, an exit-break is produced as well.) + +sub emit_break +{ +my ($self, $indent) = @_; + + # number of newlines to emit + my $break_value = $self->block_break_value (); + + $self->add_to_doc ("\n" x $break_value); + # add indent if there *was* a break + $self->add_to_doc (" " x $indent) if $indent > 0 && $break_value > 0; +} + + +# Flush pending output to output document collected thus far: +# - Wrap pending contents as necessary, with indent before *each* line. +# - Add pending text to output document (thus "flushing" it) +# - Clear pending array. + +sub flush_pending +{ +my ($self, $indent) = @_; + + # Do nothing if nothing to flush + return if !@{$self->{pending}}; + + # If current block is not normalized: + # - Text nodes cannot be modified (no wrapping or indent). Flush + # text as is without adding a break or indent. + # If current block is normalized: + # - Add a break. + # - If line wrap is disabled: + # - Add indent if there is a break. (If there isn't a break, text + # should immediately follow preceding tag, so don't add indent.) + # - Add text without wrapping + # - If line wrap is enabled: + # - First line indent is 0 if there is no break. (Text immediately + # follows preceding tag.) Otherwise first line indent is same as + # prevailing indent. + # - Any subsequent lines get the prevailing indent. + + # After flushing text, advance break type to element-break. + + my $s = ""; + + if (!$self->block_normalize ()) + { + $s .= join ("", @{$self->{pending}}); + } + else + { + $self->emit_break (0); + my $wrap_len = $self->block_wrap_length (); + my $break_value = $self->block_break_value (); + if ($wrap_len <= 0) + { + $s .= " " x $indent if $break_value > 0; + $s .= join ("", @{$self->{pending}}); + } + else + { + my $first_indent = ($break_value > 0 ? $indent : 0); + # Wrap lines, then join by newlines (don't add one at end) + my @lines = $self->line_wrap ($self->{pending}, + $first_indent, + $indent, + $wrap_len); + $s .= join ("\n", @lines); + } + } + + $self->add_to_doc ($s); + $self->{pending} = [ ]; + $self->set_block_break_type ("element-break"); +} + + +# Perform line-wrapping of string array to lines no longer than given +# length (including indent). +# Any word longer than line length appears by itself on line. +# Return array of lines (not newline-terminated). + +# $strs - reference to array of text items to be joined and line-wrapped. +# Each item may be: +# - A tag (such as <emphasis role="bold">). This should be treated as +# an atomic unit, which is important for preserving inline tags intact. +# - A possibly multi-word string (such as "This is a string"). In this +# latter case, line-wrapping preserves internal whitespace in the +# string, with the exception that if whitespace would be placed at +# the end of a line, it is discarded. + +# $first_indent - indent for first line +# $rest_indent - indent for any remaining lines +# $max_len - maximum length of output lines (including indent) + +sub line_wrap +{ +my ($self, $strs, $first_indent, $rest_indent, $max_len) = @_; + + # First, tokenize the strings + + my @words = (); + foreach my $str (@{$strs}) + { + if ($str =~ /^</) + { + # String is a tag; treat as atomic unit and don't split + push (@words, $str); + } + else + { + # String of white and non-white tokens. + # Tokenize into white and non-white tokens. + push (@words, ($str =~ /\S+|\s+/g)); + } + } + + # Now merge tokens that are not separated by whitespace tokens. For + # example, "<i>", "word", "</i>" gets merged to "<i>word</i>". But + # "<i>", " ", "word", " ", "</i>" gets left as separate tokens. + + my @words2 = (); + foreach my $word (@words) + { + # If there is a previous word that does not end with whitespace, + # and the currrent word does not begin with whitespace, concatenate + # current word to previous word. Otherwise append current word to + # end of list of words. + if (@words2 && $words2[$#words2] !~ /\s$/ && $word !~ /^\s/) + { + $words2[$#words2] .= $word; + } + else + { + push (@words2, $word); + } + } + + my @lines = (); + my $line = ""; + my $llen = 0; + # set the indent for the first line + my $indent = $first_indent; + # saved-up whitespace to put before next non-white word + my $white = ""; + + foreach my $word (@words2) # ... while words remain to wrap + { + # If word is whitespace, save it. It gets added before next + # word if no line-break occurs. + if ($word =~ /^\s/) + { + $white .= $word; + next; + } + my $wlen = length ($word); + if ($llen == 0) + { + # New output line; it gets at least one word (discard any + # saved whitespace) + $line = " " x $indent . $word; + $llen = $indent + $wlen; + $indent = $rest_indent; + $white = ""; + next; + } + if ($llen + length ($white) + $wlen > $max_len) + { + # Word (plus saved whitespace) won't fit on current line. + # Begin new line (discard any saved whitespace). + push (@lines, $line); + $line = " " x $indent . $word; + $llen = $indent + $wlen; + $indent = $rest_indent; + $white = ""; + next; + } + # add word to current line with saved whitespace between + $line .= $white . $word; + $llen += length ($white) + $wlen; + $white = ""; + } + + # push remaining line, if any + push (@lines, $line) if $line ne ""; + + return @lines; +} + +1; + +# ---------------------------------------------------------------------- + +# Begin main program + +package main; + + +my $usage = <<EOF; +Usage: $PROG_NAME [options] xml-file + +Options: +--help, -h + Print this message and exit. +--backup suffix -b suffix + Back up the input document, adding suffix to the input + filename to create the backup filename. +--canonized-output + Proceed only as far as the document canonization stage, + printing the result. +--check-parser + Parse the document into tokens and verify that their + concatenation is identical to the original input document. + This option suppresses further document processing. +--config-file file_name, -f file_name + Specify the configuration filename. If no file is named, + xmlformat uses the file named by the environment variable + XMLFORMAT_CONF, if it exists, or ./xmlformat.conf, if it + exists. Otherwise, xmlformat uses built-in formatting + options. +--in-place, -i + Format the document in place, replacing the contents of + the input file with the reformatted document. (It's a + good idea to use --backup along with this option.) +--show-config + Show configuration options after reading configuration + file. This option suppresses document processing. +--show-unconfigured-elements + Show elements that are used in the document but for + which no options were specified in the configuration + file. This option suppresses document output. +--verbose, -v + Be verbose about processing stages. +--version, -V + Show version information and exit. +EOF + +# Variables for command line options; most are undefined initially. +my $help; +my $backup_suffix; +my $conf_file; +my $canonize_only; +my $check_parser; +my $in_place; +my $show_conf; +my $show_unconf_elts; +my $show_version; +my $verbose; + +GetOptions ( + # =i means an integer argument is required after the option + # =s means a string argument is required after the option + # :s means a string argument is optional after the option + "help|h" => \$help, # print help message + "backup|b=s" => \$backup_suffix, # make backup using suffix + "canonized-output" => \$canonize_only, # print canonized document + "check-parser" => \$check_parser, # verify parser integrity + "config-file|f=s" => \$conf_file, # config file + "in-place|i" => \$in_place, # format in place + "show-config" => \$show_conf, # show configuration file + # need better name + "show-unconfigured-elements" => \$show_unconf_elts, # show unconfigured elements + "verbose|v" => \$verbose, # be verbose + "version|V" => \$show_version, # show version info +) or do { print "$usage\n"; exit (1); }; + +if (defined ($help)) +{ + print "$usage\n"; + exit (0); +} + +if (defined ($show_version)) +{ + print "$PROG_NAME $PROG_VERSION ($PROG_LANG version)\n"; + exit (0); +} + +# --in-place option requires a named file + +warn "WARNING: --in-place/-i option ignored (requires named input files)\n" + if defined ($in_place) && @ARGV == 0; + +# --backup/-b is meaningless without --in-place + +if (defined ($backup_suffix)) +{ + if (!defined ($in_place)) + { + die "--backup/-b option meaningless without --in-place/-i option\n"; + } +} + +# Save input filenames +my @in_file = @ARGV; + +my $xf = XMLFormat->new (); + +# If a configuration file was named explicitly, use it. An error occurs +# if the file does not exist. + +# If no configuration file was named, fall back to: +# - The file named by the environment variable XMLFORMAT_CONF, if it exists +# - ./xmlformat.conf, if it exists + +# If no configuration file can be found at all, the built-in default options +# are used. (These are set up in new().) + +my $env_conf_file = $ENV{XMLFORMAT_CONF}; +my $def_conf_file = "./xmlformat.conf"; + +# If no config file was named, but XMLFORMAT_CONF is set, use its value +# as the config file name. +if (!defined ($conf_file)) +{ + $conf_file = $env_conf_file if defined ($env_conf_file); +} +# If config file still isn't defined, use the default file if it exists. +if (!defined ($conf_file)) +{ + if (-r $def_conf_file && ! -d $def_conf_file) + { + $conf_file = $def_conf_file; + } +} +if (defined ($conf_file)) +{ + warn "Reading configuration file...\n" if $verbose; + die "Configuration file '$conf_file' is not readable.\n" if ! -r $conf_file; + die "Configuration file '$conf_file' is a directory.\n" if -d $conf_file; + $xf->read_config ($conf_file) +} + +if ($show_conf) # show configuration and exit +{ + $xf->display_config (); + exit(0); +} + +my ($in_doc, $out_doc); + +# Process arguments. +# - If no files named, read string, write to stdout. +# - If files named, read and process each one. Write output to stdout +# unless --in-place option was given. Make backup of original file +# if --backup option was given. + +if (@ARGV == 0) +{ + warn "Reading document...\n" if $verbose; + { + local $/ = undef; + $in_doc = <>; # slurp input document as single string + } + + $out_doc = $xf->process_doc ($in_doc, + $verbose, $check_parser, $canonize_only, $show_unconf_elts); + if (defined ($out_doc)) + { + warn "Writing output document...\n" if $verbose; + print $out_doc; + } +} +else +{ + foreach my $file (@ARGV) + { + warn "Reading document $file...\n" if $verbose; + open (IN, $file) + or die "Cannot read $file: $!\n"; + { + local $/ = undef; + $in_doc = <IN>; # slurp input document as single string + } + close (IN); + $out_doc = $xf->process_doc ($in_doc, + $verbose, $check_parser, $canonize_only, $show_unconf_elts); + next unless defined ($out_doc); + if (defined ($in_place)) + { + if (defined ($backup_suffix)) + { + warn "Making backup of $file to $file$backup_suffix...\n" if $verbose; + rename ($file, $file . $backup_suffix) + or die "Could not rename $file to $file$backup_suffix: $!\n"; + } + warn "Writing output document to $file...\n" if $verbose; + open (OUT, ">$file") or die "Cannot write to $file: $!\n"; + print OUT $out_doc; + close (OUT); + } + else + { + warn "Writing output document...\n" if $verbose; + print $out_doc; + } + } +} + +warn "Done!\n" if $verbose; + +exit (0); |