New test/compiler/c2/stemmer/Stemmer.java

   1 /**
   2  * @test
   3  * @bug 7070134
   4  * @summary Hotspot crashes with sigsegv from PorterStemmer
   5  * @modules java.base/jdk.internal.misc
   6  * @library /test/lib
   7  *
   8  * @run driver jdk.test.lib.FileInstaller words words
   9  * @run main/othervm -Xbatch compiler.c2.stemmer.Stemmer words
  10  */
  11 
  12 /*
  13 
  14    Porter stemmer in Java. The original paper is in
  15 
  16        Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  17        no. 3, pp 130-137,
  18 
  19    http://www.tartarus.org/~martin/PorterStemmer
  20 
  21    The software is completely free for any purpose, unless notes at the head
  22    of the program text indicates otherwise (which is rare). In any case,
  23    the notes about licensing are never more restrictive than the BSD License.
  24 
  25    In every case where the software is not written by me (Martin Porter),
  26    this licensing arrangement has been endorsed by the contributor, and it is
  27    therefore unnecessary to ask the contributor again to confirm it.
  28 
  29    I have not asked any contributors (or their employers, if they have them)
  30    for proofs that they have the right to distribute their software in this way.
  31 
  32    History:
  33 
  34    Release 1
  35 
  36    Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
  37    The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
  38    is then out outside the bounds of b.
  39 
  40    Release 2
  41 
  42    Similarly,
  43 
  44    Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
  45    'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
  46    b[j] is then outside the bounds of b.
  47 
  48    Release 3
  49 
  50    Considerably revised 4/9/00 in the light of many helpful suggestions
  51    from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
  52 
  53    Release 4
  54 
  55 */
  56 
  57 package compiler.c2.stemmer;
  58 
  59 import java.io.FileInputStream;
  60 import java.io.FileNotFoundException;
  61 import java.io.IOException;
  62 
  63 /**
  64   * Stemmer, implementing the Porter Stemming Algorithm
  65   *
  66   * The Stemmer class transforms a word into its root form.  The input
  67   * word can be provided a character at time (by calling add()), or at once
  68   * by calling one of the various stem(something) methods.
  69   */
  70 
  71 public class Stemmer
  72 {  private char[] b;
  73    private int i,     /* offset into b */
  74                i_end, /* offset to end of stemmed word */
  75                j, k;
  76    private static final int INC = 50;
  77                      /* unit of size whereby b is increased */
  78    public Stemmer()
  79    {  b = new char[INC];
  80       i = 0;
  81       i_end = 0;
  82    }
  83 
  84    /**
  85     * Add a character to the word being stemmed.  When you are finished
  86     * adding characters, you can call stem(void) to stem the word.
  87     */
  88 
  89    public void add(char ch)
  90    {  if (i == b.length)
  91       {  char[] new_b = new char[i+INC];
  92          for (int c = 0; c < i; c++) new_b[c] = b[c];
  93          b = new_b;
  94       }
  95       b[i++] = ch;
  96    }
  97 
  98 
  99    /** Adds wLen characters to the word being stemmed contained in a portion
 100     * of a char[] array. This is like repeated calls of add(char ch), but
 101     * faster.
 102     */
 103 
 104    public void add(char[] w, int wLen)
 105    {  if (i+wLen >= b.length)
 106       {  char[] new_b = new char[i+wLen+INC];
 107          for (int c = 0; c < i; c++) new_b[c] = b[c];
 108          b = new_b;
 109       }
 110       for (int c = 0; c < wLen; c++) b[i++] = w[c];
 111    }
 112 
 113    /**
 114     * After a word has been stemmed, it can be retrieved by toString(),
 115     * or a reference to the internal buffer can be retrieved by getResultBuffer
 116     * and getResultLength (which is generally more efficient.)
 117     */
 118    public String toString() { return new String(b,0,i_end); }
 119 
 120    /**
 121     * Returns the length of the word resulting from the stemming process.
 122     */
 123    public int getResultLength() { return i_end; }
 124 
 125    /**
 126     * Returns a reference to a character buffer containing the results of
 127     * the stemming process.  You also need to consult getResultLength()
 128     * to determine the length of the result.
 129     */
 130    public char[] getResultBuffer() { return b; }
 131 
 132    /* cons(i) is true <=> b[i] is a consonant. */
 133 
 134    private final boolean cons(int i)
 135    {  switch (b[i])
 136       {  case 'a': case 'e': case 'i': case 'o': case 'u': return false;
 137          case 'y': return (i==0) ? true : !cons(i-1);
 138          default: return true;
 139       }
 140    }
 141 
 142    /* m() measures the number of consonant sequences between 0 and j. if c is
 143       a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
 144       presence,
 145 
 146          <c><v>       gives 0
 147          <c>vc<v>     gives 1
 148          <c>vcvc<v>   gives 2
 149          <c>vcvcvc<v> gives 3
 150          ....
 151    */
 152 
 153    private final int m()
 154    {  int n = 0;
 155       int i = 0;
 156       while(true)
 157       {  if (i > j) return n;
 158          if (! cons(i)) break; i++;
 159       }
 160       i++;
 161       while(true)
 162       {  while(true)
 163          {  if (i > j) return n;
 164                if (cons(i)) break;
 165                i++;
 166          }
 167          i++;
 168          n++;
 169          while(true)
 170          {  if (i > j) return n;
 171             if (! cons(i)) break;
 172             i++;
 173          }
 174          i++;
 175        }
 176    }
 177 
 178    /* vowelinstem() is true <=> 0,...j contains a vowel */
 179 
 180    private final boolean vowelinstem()
 181    {  int i; for (i = 0; i <= j; i++) if (! cons(i)) return true;
 182       return false;
 183    }
 184 
 185    /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
 186 
 187    private final boolean doublec(int j)
 188    {  if (j < 1) return false;
 189       if (b[j] != b[j-1]) return false;
 190       return cons(j);
 191    }
 192 
 193    /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
 194       and also if the second c is not w,x or y. this is used when trying to
 195       restore an e at the end of a short word. e.g.
 196 
 197          cav(e), lov(e), hop(e), crim(e), but
 198          snow, box, tray.
 199 
 200    */
 201 
 202    private final boolean cvc(int i)
 203    {  if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false;
 204       {  int ch = b[i];
 205          if (ch == 'w' || ch == 'x' || ch == 'y') return false;
 206       }
 207       return true;
 208    }
 209 
 210    private final boolean ends(String s)
 211    {  int l = s.length();
 212       int o = k-l+1;
 213       if (o < 0) return false;
 214       for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;
 215       j = k-l;
 216       return true;
 217    }
 218 
 219    /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
 220       k. */
 221 
 222    private final void setto(String s)
 223    {  int l = s.length();
 224       int o = j+1;
 225       for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
 226       k = j+l;
 227    }
 228 
 229    /* r(s) is used further down. */
 230 
 231    private final void r(String s) { if (m() > 0) setto(s); }
 232 
 233    /* step1() gets rid of plurals and -ed or -ing. e.g.
 234 
 235           caresses  ->  caress
 236           ponies    ->  poni
 237           ties      ->  ti
 238           caress    ->  caress
 239           cats      ->  cat
 240 
 241           feed      ->  feed
 242           agreed    ->  agree
 243           disabled  ->  disable
 244 
 245           matting   ->  mat
 246           mating    ->  mate
 247           meeting   ->  meet
 248           milling   ->  mill
 249           messing   ->  mess
 250 
 251           meetings  ->  meet
 252 
 253    */
 254 
 255    private final void step1()
 256    {  if (b[k] == 's')
 257       {  if (ends("sses")) k -= 2; else
 258          if (ends("ies")) setto("i"); else
 259          if (b[k-1] != 's') k--;
 260       }
 261       if (ends("eed")) { if (m() > 0) k--; } else
 262       if ((ends("ed") || ends("ing")) && vowelinstem())
 263       {  k = j;
 264          if (ends("at")) setto("ate"); else
 265          if (ends("bl")) setto("ble"); else
 266          if (ends("iz")) setto("ize"); else
 267          if (doublec(k))
 268          {  k--;
 269             {  int ch = b[k];
 270                if (ch == 'l' || ch == 's' || ch == 'z') k++;
 271             }
 272          }
 273          else if (m() == 1 && cvc(k)) setto("e");
 274      }
 275    }
 276 
 277    /* step2() turns terminal y to i when there is another vowel in the stem. */
 278 
 279    private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; }
 280 
 281    /* step3() maps double suffices to single ones. so -ization ( = -ize plus
 282       -ation) maps to -ize etc. note that the string before the suffix must give
 283       m() > 0. */
 284 
 285    private final void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1])
 286    {
 287        case 'a': if (ends("ational")) { r("ate"); break; }
 288                  if (ends("tional")) { r("tion"); break; }
 289                  break;
 290        case 'c': if (ends("enci")) { r("ence"); break; }
 291                  if (ends("anci")) { r("ance"); break; }
 292                  break;
 293        case 'e': if (ends("izer")) { r("ize"); break; }
 294                  break;
 295        case 'l': if (ends("bli")) { r("ble"); break; }
 296                  if (ends("alli")) { r("al"); break; }
 297                  if (ends("entli")) { r("ent"); break; }
 298                  if (ends("eli")) { r("e"); break; }
 299                  if (ends("ousli")) { r("ous"); break; }
 300                  break;
 301        case 'o': if (ends("ization")) { r("ize"); break; }
 302                  if (ends("ation")) { r("ate"); break; }
 303                  if (ends("ator")) { r("ate"); break; }
 304                  break;
 305        case 's': if (ends("alism")) { r("al"); break; }
 306                  if (ends("iveness")) { r("ive"); break; }
 307                  if (ends("fulness")) { r("ful"); break; }
 308                  if (ends("ousness")) { r("ous"); break; }
 309                  break;
 310        case 't': if (ends("aliti")) { r("al"); break; }
 311                  if (ends("iviti")) { r("ive"); break; }
 312                  if (ends("biliti")) { r("ble"); break; }
 313                  break;
 314        case 'g': if (ends("logi")) { r("log"); break; }
 315    } }
 316 
 317    /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
 318 
 319    private final void step4() { switch (b[k])
 320    {
 321        case 'e': if (ends("icate")) { r("ic"); break; }
 322                  if (ends("ative")) { r(""); break; }
 323                  if (ends("alize")) { r("al"); break; }
 324                  break;
 325        case 'i': if (ends("iciti")) { r("ic"); break; }
 326                  break;
 327        case 'l': if (ends("ical")) { r("ic"); break; }
 328                  if (ends("ful")) { r(""); break; }
 329                  break;
 330        case 's': if (ends("ness")) { r(""); break; }
 331                  break;
 332    } }
 333 
 334    /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
 335 
 336    private final void step5()
 337    {   if (k == 0) return; /* for Bug 1 */ switch (b[k-1])
 338        {  case 'a': if (ends("al")) break; return;
 339           case 'c': if (ends("ance")) break;
 340                     if (ends("ence")) break; return;
 341           case 'e': if (ends("er")) break; return;
 342           case 'i': if (ends("ic")) break; return;
 343           case 'l': if (ends("able")) break;
 344                     if (ends("ible")) break; return;
 345           case 'n': if (ends("ant")) break;
 346                     if (ends("ement")) break;
 347                     if (ends("ment")) break;
 348                     /* element etc. not stripped before the m */
 349                     if (ends("ent")) break; return;
 350           case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
 351                                     /* j >= 0 fixes Bug 2 */
 352                     if (ends("ou")) break; return;
 353                     /* takes care of -ous */
 354           case 's': if (ends("ism")) break; return;
 355           case 't': if (ends("ate")) break;
 356                     if (ends("iti")) break; return;
 357           case 'u': if (ends("ous")) break; return;
 358           case 'v': if (ends("ive")) break; return;
 359           case 'z': if (ends("ize")) break; return;
 360           default: return;
 361        }
 362        if (m() > 1) k = j;
 363    }
 364 
 365    /* step6() removes a final -e if m() > 1. */
 366 
 367    private final void step6()
 368    {  j = k;
 369       if (b[k] == 'e')
 370       {  int a = m();
 371          if (a > 1 || a == 1 && !cvc(k-1)) k--;
 372       }
 373       if (b[k] == 'l' && doublec(k) && m() > 1) k--;
 374    }
 375 
 376    /** Stem the word placed into the Stemmer buffer through calls to add().
 377     * Returns true if the stemming process resulted in a word different
 378     * from the input.  You can retrieve the result with
 379     * getResultLength()/getResultBuffer() or toString().
 380     */
 381    public void stem()
 382    {  k = i - 1;
 383       if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
 384       i_end = k+1; i = 0;
 385    }
 386 
 387    /** Test program for demonstrating the Stemmer.  It reads text from a
 388     * a list of files, stems each word, and writes the result to standard
 389     * output. Note that the word stemmed is expected to be in lower case:
 390     * forcing lower case must be done outside the Stemmer class.
 391     * Usage: Stemmer file-name file-name ...
 392     */
 393    public static void main(String[] args)
 394    {
 395       char[] w = new char[501];
 396       Stemmer s = new Stemmer();
 397       for (int i = 0; i < args.length; i++)
 398       try
 399       {
 400          FileInputStream in = new FileInputStream(args[i]);
 401 
 402          try
 403          { while(true)
 404 
 405            {  int ch = in.read();
 406               if (Character.isLetter((char) ch))
 407               {
 408                  int j = 0;
 409                  while(true)
 410                  {  ch = Character.toLowerCase((char) ch);
 411                     w[j] = (char) ch;
 412                     if (j < 500) j++;
 413                     ch = in.read();
 414                     if (!Character.isLetter((char) ch))
 415                     {
 416                        /* to test add(char ch) */
 417                        for (int c = 0; c < j; c++) s.add(w[c]);
 418 
 419                        /* or, to test add(char[] w, int j) */
 420                        /* s.add(w, j); */
 421 
 422                        s.stem();
 423                        {  String u;
 424 
 425                           /* and now, to test toString() : */
 426                           u = s.toString();
 427 
 428                           /* to test getResultBuffer(), getResultLength() : */
 429                           /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */
 430 
 431                           System.out.print(u);
 432                        }
 433                        break;
 434                     }
 435                  }
 436               }
 437               if (ch < 0) break;
 438               System.out.print((char)ch);
 439            }
 440          }
 441          catch (IOException e)
 442          {  System.out.println("error reading " + args[i]);
 443             break;
 444          }
 445       }
 446       catch (FileNotFoundException e)
 447       {  System.out.println("file " + args[i] + " not found");
 448          break;
 449       }
 450    }
 451 }