MOON

File: //usr/local/lib/perl5/site_perl/5.8.8/Text/Query/ParseAdvanced.pm
#
#   Copyright (C) 1999 Eric Bohlman, Loic Dachary
#
#   This program is free software; you can redistribute it and/or modify it
#   under the terms of the GNU General Public License as published by the
#   Free Software Foundation; either version 2, or (at your option) any
#   later version.  You may also use, redistribute and/or modify it
#   under the terms of the Artistic License supplied with your Perl
#   distribution
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
#
# 
# $Header: /usr/local/cvsroot/Text-Query/lib/Text/Query/ParseAdvanced.pm,v 1.2 1999/06/14 12:53:58 loic Exp $
#
package Text::Query::ParseAdvanced;

use strict;

use Carp;
use Text::Query::Parse;

use vars qw(@ISA);

@ISA = qw(Text::Query::Parse);

sub prepare {
    my($self) = shift;
    my($qstring) = shift;
    my(%args) = @_;

    my $default_operators = {
	'or' => 'or',
	'and' => 'and',
	'near' => 'near',
	'not' => 'not',
    };

    $self->{'scope_map'} = $args{-scope_map} || {};

    return $self->SUPER::prepare($qstring, -near=>10, -operators=>$default_operators, @_);
}

sub expression($) {
    my($self) = shift;
    my($rv, $t);
    my($or) = $self->{parseopts}{-operators}{or};
    my($tokens) = $self->{'tokens'};
    $self->{'token'} = shift(@$tokens);
    $rv = $self->conj();
    while(defined($self->{'token'}) and $self->{'token'} =~ /^($or|\|)$/i) {
	$self->{'token'} = shift(@{$self->{'tokens'}});
	$t= $self->conj();
	$rv = $self->build_expression($rv,$t);
    }
    return $self->build_expression_finish($rv); 
}

sub conj($) {
    my($self) = shift;
    my($rv);
    my($first) = 1;
    my($and) = $self->{parseopts}{-operators}{and};
    $rv = $self->concat();
    while(defined($self->{'token'}) and $self->{'token'} =~ /^($and|&)$/i) {
	$self->{'token'} = shift(@{$self->{'tokens'}});
	$rv = $self->build_conj($rv, concat($self), $first);
	$first=0;
    }
    return $rv;
}

sub concat($) {
    my($self) = shift;
    my($rv,$t,$l);
    my($not) = $self->{parseopts}{-operators}{not};
    my($near) = $self->{parseopts}{-operators}{near};
    $rv = factor($self);
    while(defined($self->{'token'}) and ($l = $self->{'token'}) =~ /^\e|([\(!\~]|$not|$near)$/i) {
	$self->{'token'} = shift(@{$self->{'tokens'}}) if($l =~ /^($near|\~)$/i);
	$t = factor($self);
	if($l =~ /^($near|\~)$/i) {
	    $rv = $self->build_near($rv, $t);
	} else {
	    $rv = $self->build_concat($rv, $t);
	}
    }
    return $rv;
}

sub factor($) {
    my($self) = shift;

    my($rv,$t);
    my($not) = $self->{parseopts}{-operators}{not};
    if(!defined($t = $self->{'token'})) {
	croak("out of token in factor");
    } elsif($t eq '(') {
	$rv = $self->expression();
	if(defined($self->{'token'}) and $self->{'token'} eq ')') {
	    $self->{'token'} = shift(@{$self->{'tokens'}});
	} else {
	    croak("missing closing parenthesis in factor");
	}
    } elsif($t =~ /^($not|!)$/i) {
	$self->{'token'} = shift(@{$self->{'tokens'}});
	$rv = $self->build_negation(factor($self));
    } elsif($t =~ s/^\e//) {
	$rv = $self->build_literal($t);
	$self->{'token'} = shift(@{$self->{'tokens'}});
    } elsif($t =~ s/:$//) {
	$self->{'token'} = shift(@{$self->{'tokens'}});
	unshift(@{$self->{'scope'}}, ($self->{'scope_map'}{$t} || $t));
	$self->build_scope_start();
	$rv = $self->build_scope_end(factor($self));
	shift(@{$self->{'scope'}});
    } else {
	croak("unexpected token $t in factor");
    }
    return $rv;
}

sub parse_tokens {
    local($^W) = 0;
    my($self) = shift;
    my($line) = @_;
    my($quote, $quoted, $unquoted, $delim, $word);
    my($quotes) = $self->{parseopts}{-quotes};
    my($operators) = join("|", values(%{$self->{parseopts}{-operators}}));
    my(@tokens) = ();

    warn("quotes = $quotes") if($self->{-verbose} > 1);
    while(length($line)) {
	($quote, $quoted, undef, $unquoted, $delim, undef) =
	    $line =~ m/^([$quotes])                 # a $quote
                ((?:\\.|(?!\1)[^\\])*)    # and $quoted text
                \1 		       # followed by the same quote
                ([\000-\377]*)	       # and the rest
	       |                       # --OR--
                ^((?:\\.|[^\\$quotes])*?)    # an $unquoted text
	        (\Z(?!\n)|(?:\s*([()|&!\~]|\b(?:$operators)\b|\b(?:[-,_\.\w]+\:))\s*)|(?!^)(?=[$quotes])) # plus EOL, delimiter, or quote
                ([\000-\377]*)	       # the rest
	       /ix;		       # extended layout

	warn("quote = $quote") if($self->{-verbose} > 1 && $quote);
	last unless($quote || length($unquoted) || length($delim));
	$line = $+;
	$unquoted =~ s/^\s+//;
	$unquoted =~ s/\s+$//;
	$word .= defined($quote) ? $quoted : $unquoted;
	warn("word = $word") if($self->{-verbose} > 1 and (length($word) and (length($delim) or !length($line))));
	push(@tokens,"\e$word") if(length($word) and (length($delim) or !length($line)));
	$delim =~ s/^\s+//;
	$delim =~ s/\s+$//;
	warn("delim = $word") if($self->{-verbose} > 1 and length($delim));
	push(@tokens, $delim) if(length($delim));
	undef $word if(length($delim));
    }

    warn("parsed tokens @tokens") if($self->{-verbose} > 1);

    $self->{'tokens'} = \@tokens;
}

1;

__END__

=head1 NAME

Text::Query::ParseAdvanced - Parse AltaVista advanced query syntax

=head1 SYNOPSIS

  use Text::Query;
  my $q=new Text::Query('hello and world',
                        -parse => 'Text::Query::ParseAdvanced',
                        -solve => 'Text::Query::SolveAdvancedString',
                        -build => 'Text::Query::BuildAdvancedString');


=head1 DESCRIPTION

This module provides an object that parses a string  
containing a Boolean query expression similar to an AltaVista "advanced 
query".

It's base class is Text::Query::Parse;

Query expressions consist of literal strings (or regexps) joined by the 
following operators, in order of precedence from lowest to highest:

=over 4

=item OR, |

=item AND, &

=item NEAR, ~

=item NOT, !

=back

Operator names are not case-sensitive.  Note that if you want to use a C<|> 
in a regexp, you need to backwhack it to keep it from being seen as a query 
operator.  Sub-expressions may be quoted in single or double quotes to 
match "and," "or," or "not" literally and may be grouped in parentheses 
(C<(, )>) to alter the precedence of evaluation.

A parenthesized sub-expression may also be concatenated with other sub- 
expressions to match sequences: C<(Perl or Python) interpreter> would match 
either "Perl interpreter" or "Python interpreter".  Concatenation has a 
precedence higher than NOT but lower than AND.  Juxtaposition of 
simple words has the highest precedence of all.

=head1 OPTIONS

These are the additional options of the C<prepare> method and the constructor.

=over 4

=item -near defaults to 10

Sets the number of words that can occur between two expressions 
and still satisfy the NEAR operator.

=item -operators defaults to and, or, not, near

Sets the operator names. The argument of the option is a pointer to a
hash table mapping the default names to desired names. For instance:

    {
	'or' => 'ou',
	'and' => 'et',
	'near' => 'proche',
	'not' => 'non',
    }

=item -scope_map default to {}

Map the scope names to other names. If a scope is specified as C<scope:>
search the map for an entry whose key is C<scope> and replace C<scope> with
the scalar found. For instance:

     {
	 'scope' => 'otherscope'
     }

=back

=head1 SEE ALSO

Text::Query(3)
Text::Query::Parse(3)

=head1 AUTHORS

Eric Bohlman (ebohlman@netcom.com)

Loic Dachary (loic@senga.org)

=cut

# Local Variables: ***
# mode: perl ***
# End: ***