1 /** 2 * @test 3 * @bug 7070134 4 * @summary Hotspot crashes with sigsegv from PorterStemmer 5 * @modules java.base/jdk.internal.misc 6 * @library /testlibrary 7 * 8 * @run driver jdk.test.lib.FileInstaller words words 9 * @run main/othervm -Xbatch compiler.c2.stemmer.Stemmer words 10 */ 11 12 /* 13 14 Porter stemmer in Java. The original paper is in 15 16 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, 17 no. 3, pp 130-137, 18 19 http://www.tartarus.org/~martin/PorterStemmer 20 21 The software is completely free for any purpose, unless notes at the head 22 of the program text indicates otherwise (which is rare). In any case, 23 the notes about licensing are never more restrictive than the BSD License. 24 25 In every case where the software is not written by me (Martin Porter), 26 this licensing arrangement has been endorsed by the contributor, and it is 27 therefore unnecessary to ask the contributor again to confirm it. 28 29 I have not asked any contributors (or their employers, if they have them) 30 for proofs that they have the right to distribute their software in this way. 31 32 History: 33 34 Release 1 35 36 Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below. 37 The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1] 38 is then out outside the bounds of b. 39 40 Release 2 41 42 Similarly, 43 44 Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below. 45 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and 46 b[j] is then outside the bounds of b. 47 48 Release 3 49 50 Considerably revised 4/9/00 in the light of many helpful suggestions 51 from Brian Goetz of Quiotix Corporation (brian@quiotix.com). 52 53 Release 4 54 55 */ 56 57 package compiler.c2.stemmer; 58 59 import java.io.FileInputStream; 60 import java.io.FileNotFoundException; 61 import java.io.IOException; 62 63 /** 64 * Stemmer, implementing the Porter Stemming Algorithm 65 * 66 * The Stemmer class transforms a word into its root form. The input 67 * word can be provided a character at time (by calling add()), or at once 68 * by calling one of the various stem(something) methods. 69 */ 70 71 public class Stemmer 72 { private char[] b; 73 private int i, /* offset into b */ 74 i_end, /* offset to end of stemmed word */ 75 j, k; 76 private static final int INC = 50; 77 /* unit of size whereby b is increased */ 78 public Stemmer() 79 { b = new char[INC]; 80 i = 0; 81 i_end = 0; 82 } 83 84 /** 85 * Add a character to the word being stemmed. When you are finished 86 * adding characters, you can call stem(void) to stem the word. 87 */ 88 89 public void add(char ch) 90 { if (i == b.length) 91 { char[] new_b = new char[i+INC]; 92 for (int c = 0; c < i; c++) new_b[c] = b[c]; 93 b = new_b; 94 } 95 b[i++] = ch; 96 } 97 98 99 /** Adds wLen characters to the word being stemmed contained in a portion 100 * of a char[] array. This is like repeated calls of add(char ch), but 101 * faster. 102 */ 103 104 public void add(char[] w, int wLen) 105 { if (i+wLen >= b.length) 106 { char[] new_b = new char[i+wLen+INC]; 107 for (int c = 0; c < i; c++) new_b[c] = b[c]; 108 b = new_b; 109 } 110 for (int c = 0; c < wLen; c++) b[i++] = w[c]; 111 } 112 113 /** 114 * After a word has been stemmed, it can be retrieved by toString(), 115 * or a reference to the internal buffer can be retrieved by getResultBuffer 116 * and getResultLength (which is generally more efficient.) 117 */ 118 public String toString() { return new String(b,0,i_end); } 119 120 /** 121 * Returns the length of the word resulting from the stemming process. 122 */ 123 public int getResultLength() { return i_end; } 124 125 /** 126 * Returns a reference to a character buffer containing the results of 127 * the stemming process. You also need to consult getResultLength() 128 * to determine the length of the result. 129 */ 130 public char[] getResultBuffer() { return b; } 131 132 /* cons(i) is true <=> b[i] is a consonant. */ 133 134 private final boolean cons(int i) 135 { switch (b[i]) 136 { case 'a': case 'e': case 'i': case 'o': case 'u': return false; 137 case 'y': return (i==0) ? true : !cons(i-1); 138 default: return true; 139 } 140 } 141 142 /* m() measures the number of consonant sequences between 0 and j. if c is 143 a consonant sequence and v a vowel sequence, and <..> indicates arbitrary 144 presence, 145 146 <c><v> gives 0 147 <c>vc<v> gives 1 148 <c>vcvc<v> gives 2 149 <c>vcvcvc<v> gives 3 150 .... 151 */ 152 153 private final int m() 154 { int n = 0; 155 int i = 0; 156 while(true) 157 { if (i > j) return n; 158 if (! cons(i)) break; i++; 159 } 160 i++; 161 while(true) 162 { while(true) 163 { if (i > j) return n; 164 if (cons(i)) break; 165 i++; 166 } 167 i++; 168 n++; 169 while(true) 170 { if (i > j) return n; 171 if (! cons(i)) break; 172 i++; 173 } 174 i++; 175 } 176 } 177 178 /* vowelinstem() is true <=> 0,...j contains a vowel */ 179 180 private final boolean vowelinstem() 181 { int i; for (i = 0; i <= j; i++) if (! cons(i)) return true; 182 return false; 183 } 184 185 /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ 186 187 private final boolean doublec(int j) 188 { if (j < 1) return false; 189 if (b[j] != b[j-1]) return false; 190 return cons(j); 191 } 192 193 /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant 194 and also if the second c is not w,x or y. this is used when trying to 195 restore an e at the end of a short word. e.g. 196 197 cav(e), lov(e), hop(e), crim(e), but 198 snow, box, tray. 199 200 */ 201 202 private final boolean cvc(int i) 203 { if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false; 204 { int ch = b[i]; 205 if (ch == 'w' || ch == 'x' || ch == 'y') return false; 206 } 207 return true; 208 } 209 210 private final boolean ends(String s) 211 { int l = s.length(); 212 int o = k-l+1; 213 if (o < 0) return false; 214 for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false; 215 j = k-l; 216 return true; 217 } 218 219 /* setto(s) sets (j+1),...k to the characters in the string s, readjusting 220 k. */ 221 222 private final void setto(String s) 223 { int l = s.length(); 224 int o = j+1; 225 for (int i = 0; i < l; i++) b[o+i] = s.charAt(i); 226 k = j+l; 227 } 228 229 /* r(s) is used further down. */ 230 231 private final void r(String s) { if (m() > 0) setto(s); } 232 233 /* step1() gets rid of plurals and -ed or -ing. e.g. 234 235 caresses -> caress 236 ponies -> poni 237 ties -> ti 238 caress -> caress 239 cats -> cat 240 241 feed -> feed 242 agreed -> agree 243 disabled -> disable 244 245 matting -> mat 246 mating -> mate 247 meeting -> meet 248 milling -> mill 249 messing -> mess 250 251 meetings -> meet 252 253 */ 254 255 private final void step1() 256 { if (b[k] == 's') 257 { if (ends("sses")) k -= 2; else 258 if (ends("ies")) setto("i"); else 259 if (b[k-1] != 's') k--; 260 } 261 if (ends("eed")) { if (m() > 0) k--; } else 262 if ((ends("ed") || ends("ing")) && vowelinstem()) 263 { k = j; 264 if (ends("at")) setto("ate"); else 265 if (ends("bl")) setto("ble"); else 266 if (ends("iz")) setto("ize"); else 267 if (doublec(k)) 268 { k--; 269 { int ch = b[k]; 270 if (ch == 'l' || ch == 's' || ch == 'z') k++; 271 } 272 } 273 else if (m() == 1 && cvc(k)) setto("e"); 274 } 275 } 276 277 /* step2() turns terminal y to i when there is another vowel in the stem. */ 278 279 private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; } 280 281 /* step3() maps double suffices to single ones. so -ization ( = -ize plus 282 -ation) maps to -ize etc. note that the string before the suffix must give 283 m() > 0. */ 284 285 private final void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1]) 286 { 287 case 'a': if (ends("ational")) { r("ate"); break; } 288 if (ends("tional")) { r("tion"); break; } 289 break; 290 case 'c': if (ends("enci")) { r("ence"); break; } 291 if (ends("anci")) { r("ance"); break; } 292 break; 293 case 'e': if (ends("izer")) { r("ize"); break; } 294 break; 295 case 'l': if (ends("bli")) { r("ble"); break; } 296 if (ends("alli")) { r("al"); break; } 297 if (ends("entli")) { r("ent"); break; } 298 if (ends("eli")) { r("e"); break; } 299 if (ends("ousli")) { r("ous"); break; } 300 break; 301 case 'o': if (ends("ization")) { r("ize"); break; } 302 if (ends("ation")) { r("ate"); break; } 303 if (ends("ator")) { r("ate"); break; } 304 break; 305 case 's': if (ends("alism")) { r("al"); break; } 306 if (ends("iveness")) { r("ive"); break; } 307 if (ends("fulness")) { r("ful"); break; } 308 if (ends("ousness")) { r("ous"); break; } 309 break; 310 case 't': if (ends("aliti")) { r("al"); break; } 311 if (ends("iviti")) { r("ive"); break; } 312 if (ends("biliti")) { r("ble"); break; } 313 break; 314 case 'g': if (ends("logi")) { r("log"); break; } 315 } } 316 317 /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ 318 319 private final void step4() { switch (b[k]) 320 { 321 case 'e': if (ends("icate")) { r("ic"); break; } 322 if (ends("ative")) { r(""); break; } 323 if (ends("alize")) { r("al"); break; } 324 break; 325 case 'i': if (ends("iciti")) { r("ic"); break; } 326 break; 327 case 'l': if (ends("ical")) { r("ic"); break; } 328 if (ends("ful")) { r(""); break; } 329 break; 330 case 's': if (ends("ness")) { r(""); break; } 331 break; 332 } } 333 334 /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */ 335 336 private final void step5() 337 { if (k == 0) return; /* for Bug 1 */ switch (b[k-1]) 338 { case 'a': if (ends("al")) break; return; 339 case 'c': if (ends("ance")) break; 340 if (ends("ence")) break; return; 341 case 'e': if (ends("er")) break; return; 342 case 'i': if (ends("ic")) break; return; 343 case 'l': if (ends("able")) break; 344 if (ends("ible")) break; return; 345 case 'n': if (ends("ant")) break; 346 if (ends("ement")) break; 347 if (ends("ment")) break; 348 /* element etc. not stripped before the m */ 349 if (ends("ent")) break; return; 350 case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break; 351 /* j >= 0 fixes Bug 2 */ 352 if (ends("ou")) break; return; 353 /* takes care of -ous */ 354 case 's': if (ends("ism")) break; return; 355 case 't': if (ends("ate")) break; 356 if (ends("iti")) break; return; 357 case 'u': if (ends("ous")) break; return; 358 case 'v': if (ends("ive")) break; return; 359 case 'z': if (ends("ize")) break; return; 360 default: return; 361 } 362 if (m() > 1) k = j; 363 } 364 365 /* step6() removes a final -e if m() > 1. */ 366 367 private final void step6() 368 { j = k; 369 if (b[k] == 'e') 370 { int a = m(); 371 if (a > 1 || a == 1 && !cvc(k-1)) k--; 372 } 373 if (b[k] == 'l' && doublec(k) && m() > 1) k--; 374 } 375 376 /** Stem the word placed into the Stemmer buffer through calls to add(). 377 * Returns true if the stemming process resulted in a word different 378 * from the input. You can retrieve the result with 379 * getResultLength()/getResultBuffer() or toString(). 380 */ 381 public void stem() 382 { k = i - 1; 383 if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); } 384 i_end = k+1; i = 0; 385 } 386 387 /** Test program for demonstrating the Stemmer. It reads text from a 388 * a list of files, stems each word, and writes the result to standard 389 * output. Note that the word stemmed is expected to be in lower case: 390 * forcing lower case must be done outside the Stemmer class. 391 * Usage: Stemmer file-name file-name ... 392 */ 393 public static void main(String[] args) 394 { 395 char[] w = new char[501]; 396 Stemmer s = new Stemmer(); 397 for (int i = 0; i < args.length; i++) 398 try 399 { 400 FileInputStream in = new FileInputStream(args[i]); 401 402 try 403 { while(true) 404 405 { int ch = in.read(); 406 if (Character.isLetter((char) ch)) 407 { 408 int j = 0; 409 while(true) 410 { ch = Character.toLowerCase((char) ch); 411 w[j] = (char) ch; 412 if (j < 500) j++; 413 ch = in.read(); 414 if (!Character.isLetter((char) ch)) 415 { 416 /* to test add(char ch) */ 417 for (int c = 0; c < j; c++) s.add(w[c]); 418 419 /* or, to test add(char[] w, int j) */ 420 /* s.add(w, j); */ 421 422 s.stem(); 423 { String u; 424 425 /* and now, to test toString() : */ 426 u = s.toString(); 427 428 /* to test getResultBuffer(), getResultLength() : */ 429 /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */ 430 431 System.out.print(u); 432 } 433 break; 434 } 435 } 436 } 437 if (ch < 0) break; 438 System.out.print((char)ch); 439 } 440 } 441 catch (IOException e) 442 { System.out.println("error reading " + args[i]); 443 break; 444 } 445 } 446 catch (FileNotFoundException e) 447 { System.out.println("file " + args[i] + " not found"); 448 break; 449 } 450 } 451 }