#! /usr/bin/perl -w
#
# Lovins-stemmer.pl Version 1.1
#
# Copyright 2001 Gordon W. Paynter (paynter@cs.waikato.ac.nz).
# Distributed under the GNU General Public License (Version 2).
#
# An imlpementation of the Lovins' stemmer.
# It's not very efficient, but it works.
#
# The algorithm is described in "Development of a Stemming Algorithm"
# by Julie Beth Lovins (published in "Mechanical translation and
# computational linguistics", 11:22-31, 1968)


while (<>) {
    print &LovinsStemmer::stemline($_);
}


package LovinsStemmer;

BEGIN {
    # The "List of endings" describedin Lovins' paper.
    %ending_list = (
		    'a' => A,

		    'ae' => A,
		    'al' => BB,
		    'ar' => X,
		    'as' => B,

		    'acy' => A,
		    'age' => B,
		    'aic' => A,
		    'als' => BB,
		    'ant' => B,
		    'ars' => O,
		    'ary' => F,
		    'ata' => A,
		    'ate' => A,

		    'able' => A,
		    'ably' => A,
		    'ages' => B,
		    'ally' => B,
		    'ance' => B,
		    'ancy' => B,
		    'ants' => B,
		    'aric' => A,
		    'arly' => K,
		    'ated' => I,
		    'ates' => A,
		    'atic' => B,
		    'ator' => A,

		    'acies' => A,
		    'acity' => A,
		    'aging' => B,
		    'aical' => A,
		    'alist' => A,
		    'alism' => B,
		    'ality' => A,
		    'alize' => A,
		    'allic' => BB,
		    'anced' => B,
		    'ances' => B,
		    'antic' => C,
		    'arial' => A,
		    'aries' => A,
		    'arily' => A,
		    'arity' => B,
		    'arize' => A,
		    'aroid' => A,
		    'ately' => A,
		    'ating' => I,
		    'ation' => B,
		    'ative' => A,
		    'ators' => A,
		    'atory' => A,
		    'ature' => E,

		    'aceous' => A,
		    'acious' => B,
		    'action' => G,
		    'alness' => A,
		    'ancial' => A,
		    'ancies' => A,
		    'ancing' => B,
		    'ariser' => A,
		    'arized' => A,
		    'arizer' => A,
		    'atable' => A,
		    'ations' => B,
		    'atives' => A,
		    
		    'ability' => A,
		    'aically' => A,
		    'alistic' => B,
		    'alities' => A,
		    'ariness' => E,
		    'aristic' => A,
		    'arizing' => A,
		    'ateness' => A,
		    'atingly' => A,
		    'ational' => B,
		    'atively' => A,
		    'ativism' => A,
		    
		    'ableness' => A,
		    'arizable' => A,
		    
		    'allically' => C,
		    'antaneous' => A,
		    'antiality' => A,
		    'arisation' => A,
		    'arization' => A,
		    'ationally' => B,
		    'ativeness' => A,
		    
		    'antialness' => A,
		    'arisations' => A,
		    'arizations' => A,

		    'alistically' => B,
		    'arizability' => A,

		    'e' => A,
		    
		    'ed' => E,
		    'en' => F,
		    'es' => E,
		    
		    'eal' => Y,
		    'ear' => Y,
		    'ely' => E,
		    'ene' => E,
		    'ent' => C,
		    'ery' => E,
		    'ese' => A,
		    
		    'ealy' => Y,
		    'edly' => E,
		    'eful' => A,
		    'eity' => A,
		    'ence' => A,
		    'ency' => A,
		    'ened' => E,
		    'enly' => E,
		    'eous' => A,
		    
		    'early' => Y,
		    'ehood' => A,
		    'eless' => A,
		    'elily' => A,
		    'ement' => A,
		    'enced' => A,
		    'ences' => A,
		    'eness' => E,
		    'ening' => E,
		    'ental' => A,
		    'ented' => C,
		    'ently' => A,
		    
		    'eature' => Z,
		    'efully' => A,
		    'encies' => A,
		    'encing' => A,
		    'ential' => A,
		    'enting' => C,
		    'entist' => A,
		    'eously' => A,

		    'elihood' => E,
		    'encible' => A,
		    'entally' => A,
		    'entials' => A,
		    'entiate' => A,
		    'entness' => A,

		    'entation' => A,
		    'entially' => A,
		    'eousness' => A,

		    'eableness' => E,
		    'entations' => A,
		    'entiality' => A,
		    'entialize' => A,
		    'entiation' => A,

		    'entialness' => A,

		    'ful' => A,

		    'fully' => A,

		    'fulness' => A,

		    'hood' => A,

		    'i' => A,

		    'ia' => A,
		    'ic' => A,
		    'is' => A,

		    'ial' => A,
		    'ian' => A,
		    'ics' => A,
		    'ide' => L,
		    'ied' => A,
		    'ier' => A,
		    'ies' => P,
		    'ily' => A,
		    'ine' => M,
		    'ing' => N,
		    'ion' => Q,
		    'ish' => C,
		    'ism' => B,
		    'ist' => A,
		    'ite' => AA,
		    'ity' => A,
		    'ium' => A,
		    'ive' => A,
		    'ize' => F,

		    'ials' => A,
		    'ians' => A,
		    'ible' => A,
		    'ibly' => A,
		    'ical' => A,
		    'ides' => L,
		    'iers' => A,
		    'iful' => A,
		    'ines' => M,
		    'ings' => N,
		    'ions' => B,
		    'ious' => A,
		    'isms' => B,
		    'ists' => A,
		    'itic' => H,
		    'ized' => F,
		    'izer' => F,

		    'ially' => A,
		    'icant' => A,
		    'ician' => A,
		    'icide' => A,
		    'icism' => A,
		    'icist' => A,
		    'icity' => A,
		    'idine' => I,
		    'iedly' => A,
		    'ihood' => A,
		    'inate' => A,
		    'iness' => A,
		    'ingly' => B,
		    'inism' => J,
		    'inity' => CC,
		    'ional' => A,
		    'ioned' => A,
		    'ished' => A,
		    'istic' => A,
		    'ities' => A,
		    'itous' => A,
		    'ively' => A,
		    'ivity' => A,
		    'izers' => F,
		    'izing' => F,

		    'ialist' => A,
		    'iality' => A,
		    'ialize' => A,
		    'ically' => A,
		    'icance' => A,
		    'icians' => A,
		    'icists' => A,
		    'ifully' => A,
		    'ionals' => A,
		    'ionate' => D,
		    'ioning' => A,
		    'ionist' => A,
		    'iously' => A,
		    'istics' => A,
		    'izable' => E,

		    'ibility' => A,
		    'icalism' => A,
		    'icalist' => A,
		    'icality' => A,
		    'icalize' => A,
		    'ication' => G,
		    'icianry' => A,
		    'ination' => A,
		    'ingness' => A,
		    'ionally' => A,
		    'isation' => A,
		    'ishness' => A,
		    'istical' => A,
		    'iteness' => A,
		    'iveness' => A,
		    'ivistic' => A,
		    'ivities' => A,
		    'ization' => F,
		    'izement' => A,

		    'ibleness' => A,
		    'icalness' => A,
		    'ionalism' => A,
		    'ionality' => A,
		    'ionalize' => A,
		    'iousness' => A,
		    'izations' => A,

		    'ionalness' => A,
		    'istically' => A,
		    'itousness' => A,
		    'izability' => A,
		    'izational' => A,

		    'izationally' => B,

		    'ly' => B,

		    'less' => A,
		    'lily' => A,

		    'lessly' => A,

		    'lessness' => A,

		    'ness' => 'A',

		    'nesses' => A,

		    'o' => A,

		    'on' => S,
		    'or' => T,

		    'oid' => A,
		    'one' => R,
		    'ous' => A,

		    'ogen' => A,

		    'oidal' => A,
		    'oides' => A,
		    'otide' => A,
		    'ously' => A,

		    'oidism' => A,

		    'oidally' => A,
		    'ousness' => A,

		    's' => W,

		    "s'" => A,

		    'um' => U,
		    'us' => V,

		    'ward' => A,
		    'wise' => A,

		    'y' => B,

		    'yl' => R,

		    'ying' => B,
		    'yish' => A,

		    "'s" => A,
		    );
    
}



# Stem any given string.

sub stemline {
    my ($line) = @_;
    my ($word, $rest);
    my $result = '';

    $line = lc;
    while($line) {
	($word, $rest) = $line =~ /^([a-z]*)(.*)/s;
	if ($word) {
	    $result .= &stemword($word);
	} else {
	    ($word, $rest) = $line =~ /^([^a-z]*)(.*)/s;
	    $result .= $word;
	}
	$line = $rest;
    }
    return $result;
}


# Stem a lowercase word

sub stemword {
    my $word = shift @_;

    # the word length
    my $wlen = length $word;
    return $word unless ($wlen > 2);

    # Phase 1: search for a match between the phrase and the list of endings
    my ($prefix, $suffix, $prelen, $sufflen);

    # The longest suffix is 11 characters long
    if ($wlen <= 13) {
	$prelen = 2;
	$prefix = substr $word, 0, 2;
	$suffix = substr $word, 2;
	$suflen = length $suffix;
    } else {
	$prelen = $wlen - 11;
	$prefix = substr $word, 0, $prelen;
	$suffix = substr $word, $prelen;
	$suflen = length $suffix;
    }

    my ($condition_code, $stem);
    for (;;) {
	if ($condition_code = $ending_list{$suffix}) {
	    # Stem if context-sensitive rules are satisfied
	    if (($condition_code eq 'A')  
		# I've sorted these codes in the order of the frequency of their
	        # in the brown corpus (more or less).
		|| (($condition_code eq 'B') && ($prelen >= 3))
		|| (($condition_code eq 'W') && ($prefix !~ /[su]$/o))
		|| (($condition_code eq 'E') && ($prefix !~ /e$/o))

		|| (($condition_code eq 'N') && ($prefix =~ /([^s]..|.s..)$/o))
		|| (($condition_code eq 'F') && ($prelen >= 3) && ($prefix !~ /e$/o))

		|| (($condition_code eq 'Q') && ($prelen >= 3) && ($prefix !~ /[ln]$/o))
		|| (($condition_code eq 'C') && ($prelen >= 4))
		|| (($condition_code eq 'BB') && ($prelen >= 3) && ($prefix !~ /(met|ryst)$/o))

		|| (($condition_code eq 'S') && ($prefix =~ /(dr|[^t]t)$/o))
		|| (($condition_code eq 'T') && ($prefix =~ /(s|[^o]t)$/o))
		|| (($condition_code eq 'X') && ($prefix =~ /(l|i|u.e)$/o))
		|| (($condition_code eq 'I') && ($prefix !~ /[oe]$/o))
		|| (($condition_code eq 'P') && ($prefix !~ /c$/o))

		|| (($condition_code eq 'M') && ($prefix !~ /[aecm]$/o))
		|| (($condition_code eq 'L') && ($prefix !~ /(u|x|[^o]s)$/o))
		|| (($condition_code eq 'O') && ($prefix =~ /[li]$/o))
		|| (($condition_code eq 'AA') && ($prefix =~ /(d|f|ph|th|l|er|or|es|t)$/o))
		|| (($condition_code eq 'V') && ($prefix =~ /c$/o))

		|| (($condition_code eq 'R') && ($prefix =~ /[nr]$/o))
		|| (($condition_code eq 'Y') && ($prefix =~ /in$/o))
		|| (($condition_code eq 'G') && ($prelen >= 3) && ($prefix =~ /f$/o))
		|| (($condition_code eq 'K') && ($prelen >= 3) && ($prefix =~ /(l|i|u.e)$/o))
		|| (($condition_code eq 'U') && ($prefix =~ /[lmnr]$/o))
		|| (($condition_code eq 'D') && ($prelen >= 5))
		|| (($condition_code eq 'H') && ($prefix =~ /(t|ll)$/o))
		|| (($condition_code eq 'J') && ($prefix !~ /[ae]$/o))
		|| (($condition_code eq 'Z') && ($prefix !~ /f$/o))
		|| (($condition_code eq 'CC') && ($prefix =~ /l$/o)) ) {
		
		$word = $prefix;
		$wlen = $prelen;
		last;
	    }
	}
	
	# Try the next shorter suffix
	$prefix .= substr $suffix, 0, 1;
	$prelen++;
	$suffix = substr $suffix, 1;
	$suflen--;
	last if ($suflen == 0);

       
    }

    # Phase 2: recoding suffixes
    my $last1 = substr $word, ($wlen - 1);
    if ($last1 eq 't') {
	$word =~ s/tt$/t/o;
	$word =~ s/uct$/uc/o;
	$word =~ s/umpt$/um/o;
	$word =~ s/rpt$/rb/o;
	$word =~ s/mit$/mis/o;
	$word =~ s/ert$/ers/o;
	$word =~ s/^et$/es/o;
	$word =~ s/([^n])et$/$1es/o;
        $word =~ s/yt$/ys/o;
    } elsif ($last1 eq 'r') {
	$word =~ s/rr$/r/o;
	$word =~ s/istr$/ister/o;
	$word =~ s/metr$/meter/o;
	$word =~ s/^her$/hes/o;
	$word =~ s/([^pt])her$/$1hes/o;
    } elsif ($last1 eq 'd') {
	$word =~ s/dd$/d/o;
	$word =~ s/uad$/uas/o;
	$word =~ s/vad$/vas/o;
	$word =~ s/cid$/cis/o;
	$word =~ s/lid$/lis/o;
	$word =~ s/erid$/eris/o;
	$word =~ s/pand$/pans/o;
	$word =~ s/^end$/ens/o;
	$word =~ s/([^s])end$/$1ens/o;
	$word =~ s/ond$/ons/o;
	$word =~ s/lud$/lus/o;
	$word =~ s/rud$/rus/o;
	$word =~ s/^end$/ens/o;
	$word =~ s/([^m])end$/$1ens/o;
    } elsif ($last1 eq 'n') {
	$word =~ s/nn$/n/o;
    } elsif ($last1 eq 'l') {
	$word =~ s/ll$/l/o;
	$word =~ s/^ul$/$1l/o;
	$word =~ s/([^aio])ul$/$1l/o;
    } elsif ($last1 eq 'm') {
	$word =~ s/mm$/m/o;
    } elsif ($last1 eq 's') {
	$word =~ s/ss$/s/o;
	$word =~ s/urs$/ur/o;
    } elsif ($last1 eq 'g') {
	$word =~ s/gg$/g/o;
    } elsif ($last1 eq 'v') {
	$word =~ s/iev$/ief/o;
	$word =~ s/olv$/olut/o;
    } elsif ($last1 eq 'p') {
	$word =~ s/pp$/p/o;
    } elsif ($last1 eq 'b') {
	$word =~ s/bb$/b/o;
    } elsif ($last1 eq 'x') {
	$word =~ s/bex$/bic/o;
	$word =~ s/dex$/dic/o;
	$word =~ s/pex$/pic/o;
	$word =~ s/tex$/tic/o;
	$word =~ s/ax$/ac/o;
	$word =~ s/ex$/ec/o;
	$word =~ s/ix$/ic/o;
	$word =~ s/lux$/luc/o;
    } elsif ($last1 eq 'z') {
	$word =~ s/yz$/ys/o;
    }

    # output
    return $word;
}

