1 /*
   2  * Copyright (c) 2004, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.tools.javac.parser;
  27 
  28 import com.sun.tools.javac.parser.Tokens.Comment;
  29 import com.sun.tools.javac.parser.Tokens.Comment.CommentStyle;
  30 import com.sun.tools.javac.util.*;
  31 
  32 import java.nio.*;
  33 import java.util.regex.Pattern;
  34 
  35 import static com.sun.tools.javac.util.LayoutCharacters.*;
  36 
  37 /** An extension to the base lexical analyzer that captures
  38  *  and processes the contents of doc comments.  It does so by
  39  *  translating Unicode escape sequences and by stripping the
 *  leading whitespace and stars from each line of the comment.
  41  *
  42  *  <p><b>This is NOT part of any supported API.
  43  *  If you write code that depends on this, you do so at your own risk.
  44  *  This code and its internal interfaces are subject to change or
  45  *  deletion without notice.</b>
  46  */
  47 public class JavadocTokenizer extends JavaTokenizer {
  48 
    /** Create a scanner from the input buffer.  buffer must implement
     *  array() and compact(), and remaining() must be less than limit().
     *
     *  @param fac    the scanner factory; handed unchanged to the superclass
     *  @param buffer the input characters; handed unchanged to the superclass
     */
    protected JavadocTokenizer(ScannerFactory fac, CharBuffer buffer) {
        super(fac, buffer);
    }
  55 
    /** Create a scanner from the input array.  The array must have at
     *  least a single character of extra space.
     *
     *  @param fac         the scanner factory; handed unchanged to the superclass
     *  @param input       the input characters; handed unchanged to the superclass
     *  @param inputLength the length value handed to the superclass together
     *                     with {@code input}
     */
    protected JavadocTokenizer(ScannerFactory fac, char[] input, int inputLength) {
        super(fac, input, inputLength);
    }
  62 
  63     @Override
  64     protected Comment processComment(int pos, int endPos, CommentStyle style) {
  65         char[] buf = reader.getRawCharacters(pos, endPos);
  66         return new JavadocComment(new DocReader(fac, buf, buf.length, pos), style);
  67     }
  68 
  69     /**
  70      * This is a specialized version of UnicodeReader that keeps track of the
  71      * column position within a given character stream (used for Javadoc processing),
  72      * and which builds a table for mapping positions in the comment string to
  73      * positions in the source file.
  74      */
  75     static class DocReader extends UnicodeReader {
  76 
         /**
          * The current column. It is reset to 0 by {@code scanChar()} on a
          * carriage return or newline, advanced to the next multiple of
          * {@code TabInc} on a tab, and incremented for other characters.
          */
         int col;

         /**
          * The offset added to {@code bp} when a source position is recorded
          * in {@link #pbuf}; supplied to the constructor.
          */
         int startPos;

         /**
          * A buffer for building a table for mapping positions in {@link #sbuf}
          * to positions in the source buffer.
          *
          * The array is organized as a series of pairs of integers: the first
          * number in each pair specifies a position in the comment text,
          * the second number in each pair specifies the corresponding position
          * in the source buffer. The pairs are sorted in ascending order.
          *
          * Since the mapping function is generally continuous, with successive
          * positions in the string corresponding to successive positions in the
          * source buffer, the table only needs to record discontinuities in
          * the mapping. The values of intermediate positions can be inferred.
          *
          * Discontinuities may occur in a number of places: when a newline
          * is followed by whitespace and asterisks (which are ignored),
          * when a tab is expanded into spaces, and when unicode escapes
          * are used in the source buffer.
          *
          * Thus, to find the source position of any position, p, in the comment
          * string, find the index, i, of the pair whose string offset
          * ({@code pbuf[i] }) is closest to but not greater than p. Then,
          * {@code sourcePos(p) = pbuf[i+1] + (p - pbuf[i]) }.
          */
         int[] pbuf = new int[128];

         /**
          * The index of the next empty slot in the pbuf buffer.
          */
         int pp = 0;

         /** The buffer index of the last double backslash sequence
          */
         private int doubleBackslashBp = -1;
 114 
         /**
          * Creates a reader over the characters of a single comment.
          *
          * @param fac         the scanner factory; handed unchanged to the superclass
          * @param input       the comment characters; handed unchanged to the superclass
          * @param inputLength the length value handed to the superclass together
          *                    with {@code input}
          * @param startPos    stored in {@link #startPos}; added to {@code bp}
          *                    whenever a source position is recorded by
          *                    {@code putChar}
          */
         DocReader(ScannerFactory fac, char[] input, int inputLength, int startPos) {
             super(fac, input, inputLength);
             this.startPos = startPos;
         }
 119 
 120          @Override
 121          protected void convertUnicode() {
 122              if (ch == '\\' && unicodeConversionBp != bp) {
 123                  bp++; ch = buf[bp]; col++;
 124                  if (ch == 'u') {
 125                      do {
 126                          bp++; ch = buf[bp]; col++;
 127                      } while (ch == 'u');
 128                      int limit = bp + 3;
 129                      if (limit < buflen) {
 130                          int d = digit(bp, 16);
 131                          int code = d;
 132                          while (bp < limit && d >= 0) {
 133                              bp++; ch = buf[bp]; col++;
 134                              d = digit(bp, 16);
 135                              code = (code << 4) + d;
 136                          }
 137                          if (d >= 0) {
 138                              ch = (char)code;
 139                              unicodeConversionBp = bp;
 140                              return;
 141                          }
 142                      }
 143                      // "illegal.Unicode.esc", reported by base scanner
 144                  } else {
 145                      bp--;
 146                      ch = '\\';
 147                      col--;
 148                  }
 149              }
 150          }
 151 
 152          @Override
 153          protected void scanCommentChar() {
 154              scanChar();
 155              if (ch == '\\') {
 156                  if (peekChar() == '\\' && !isUnicode()) {
 157                      bp++; col++;
 158                      doubleBackslashBp = bp;
 159                  } else {
 160                      convertUnicode();
 161                  }
 162              }
 163          }
 164 
 165          @Override
 166          protected void scanChar() {
 167              bp++;
 168              ch = buf[bp];
 169              switch (ch) {
 170              case '\r': // return
 171                  col = 0;
 172                  break;
 173              case '\n': // newline
 174                  if (bp == 0 || buf[bp-1] != '\r') {
 175                      col = 0;
 176                  }
 177                  break;
 178              case '\t': // tab
 179                  col = (col / TabInc * TabInc) + TabInc;
 180                  break;
 181              case '\\': // possible Unicode
 182                  col++;
 183                  convertUnicode();
 184                  break;
 185              default:
 186                  col++;
 187                  break;
 188              }
 189          }
 190 
 191          @Override
 192          public void putChar(char ch, boolean scan) {
 193              // At this point, bp is the position of the current character in buf,
 194              // and sp is the position in sbuf where this character will be put.
 195              // Record a new entry in pbuf if pbuf is empty or if sp and its
 196              // corresponding source position are not equidistant from the
 197              // corresponding values in the latest entry in the pbuf array.
 198              // (i.e. there is a discontinuity in the map function.)
 199              if ((pp == 0)
 200                      || (sp - pbuf[pp - 2] != (startPos + bp) - pbuf[pp - 1])) {
 201                  if (pp + 1 >= pbuf.length) {
 202                      int[] new_pbuf = new int[pbuf.length * 2];
 203                      System.arraycopy(pbuf, 0, new_pbuf, 0, pbuf.length);
 204                      pbuf = new_pbuf;
 205                  }
 206                  pbuf[pp] = sp;
 207                  pbuf[pp + 1] = startPos + bp;
 208                  pp += 2;
 209              }
 210              super.putChar(ch, scan);
 211          }
 212 
 213          /** Whether the ch represents a sequence of two backslashes. */
 214          boolean isDoubleBackslash() {
 215              return doubleBackslashBp == bp;
 216          }
 217 
 218 
 219      }
 220 
 221      protected static class JavadocComment extends JavaTokenizer.BasicComment<DocReader> {
 222 
        /**
         * Translated and stripped contents of doc comment.
         * Remains {@code null} until {@code scanDocComment()} has assigned it.
         */
        private String docComment = null;

        /**
         * Copy of the used part of the reader's position table, made by
         * {@code scanDocComment()} when the comment text is not empty;
         * otherwise {@code null}. It holds pairs of integers: an offset in
         * {@code docComment} followed by the corresponding source position.
         */
        private int[] docPosns = null;
 228 
        /**
         * Creates a comment backed by the given reader.
         *
         * @param reader the reader over the comment's characters; handed
         *               unchanged to the superclass
         * @param cs     the comment style; handed unchanged to the superclass
         */
        JavadocComment(DocReader reader, CommentStyle cs) {
            super(reader, cs);
        }
 232 
 233         @Override
 234         public String getText() {
 235             if (!scanned && cs == CommentStyle.JAVADOC) {
 236                 scanDocComment();
 237             }
 238             return docComment;
 239         }
 240 
 241         @Override
 242         public int getSourcePos(int pos) {
 243             // Binary search to find the entry for which the string index is
 244             // less than pos. Since docPosns is a list of pairs of integers
 245             // we must make sure the index is always even.
 246             // If we find an exact match for pos, the other item in the pair
 247             // gives the source pos; otherwise, compute the source position
 248             // relative to the best match found in the array.
 249             if (pos == Position.NOPOS)
 250                 return Position.NOPOS;
 251             if (pos < 0 || pos > docComment.length())
 252                 throw new StringIndexOutOfBoundsException(String.valueOf(pos));
 253             if (docPosns == null)
 254                 return Position.NOPOS;
 255             int start = 0;
 256             int end = docPosns.length;
 257             while (start < end - 2) {
 258                 // find an even index midway between start and end
 259                 int index = ((start  + end) / 4) * 2;
 260                 if (docPosns[index] < pos)
 261                     start = index;
 262                 else if (docPosns[index] == pos)
 263                     return docPosns[index + 1];
 264                 else
 265                     end = index;
 266             }
 267             return docPosns[start + 1] + (pos - docPosns[start]);
 268         }
 269 
        /**
         * Scans the raw comment held by {@code comment_reader} and stores the
         * processed text in {@code docComment}.
         *
         * The opening slash and stars are skipped; a comment of the form
         * {@code /**}{@code /} (any number of stars) yields the empty string.
         * A line terminator directly after the opening stars is skipped.
         * Each following line is then handled in three steps: leading spaces,
         * tabs and form feeds are consumed; leading stars are consumed (and a
         * slash right after them ends the comment); the rest of the line is
         * appended to the reader's buffer. If a line other than the first does
         * not begin with a star, the reader is rewound so that its leading
         * whitespace is kept as text. CR and CR LF line terminators are both
         * stored as a single LF, and a form feed inside the text ends the
         * line without being stored.
         *
         * Finally any trailing stars are dropped from the collected text, and
         * the used part of the reader's position table is copied into
         * {@code docPosns}. In all cases, on exit {@code scanned} is set,
         * {@code comment_reader} is released, and {@code deprecatedFlag} is
         * set if the text matches {@code DEPRECATED_PATTERN}.
         */
        @Override
        @SuppressWarnings("fallthrough")
        protected void scanDocComment() {
             try {
                 boolean firstLine = true;

                 // Skip over first slash
                 comment_reader.scanCommentChar();
                 // Skip over first star
                 comment_reader.scanCommentChar();

                 // consume any number of stars
                 while (comment_reader.bp < comment_reader.buflen && comment_reader.ch == '*') {
                     comment_reader.scanCommentChar();
                 }
                 // is the comment in the form /**/, /***/, /****/, etc. ?
                 if (comment_reader.bp < comment_reader.buflen && comment_reader.ch == '/') {
                     docComment = "";
                     return;
                 }

                 // skip a newline on the first line of the comment.
                 if (comment_reader.bp < comment_reader.buflen) {
                     if (comment_reader.ch == LF) {
                         comment_reader.scanCommentChar();
                         firstLine = false;
                     } else if (comment_reader.ch == CR) {
                         comment_reader.scanCommentChar();
                         if (comment_reader.ch == LF) {
                             comment_reader.scanCommentChar();
                             firstLine = false;
                         }
                     }
                 }

             outerLoop:

                 // The outerLoop processes the doc comment, looping once
                 // for each line.  For each line, it first strips off
                 // whitespace, then it consumes any stars, then it
                 // puts the rest of the line into our buffer.
                 while (comment_reader.bp < comment_reader.buflen) {
                     // Remember where the line started so that the leading
                     // whitespace can be restored if no star follows it.
                     int begin_bp = comment_reader.bp;
                     char begin_ch = comment_reader.ch;
                     // The wsLoop consumes whitespace from the beginning
                     // of each line.
                 wsLoop:

                     while (comment_reader.bp < comment_reader.buflen) {
                         switch(comment_reader.ch) {
                         case ' ':
                             comment_reader.scanCommentChar();
                             break;
                         case '\t':
                             comment_reader.col = ((comment_reader.col - 1) / TabInc * TabInc) + TabInc;
                             comment_reader.scanCommentChar();
                             break;
                         case FF:
                             comment_reader.col = 0;
                             comment_reader.scanCommentChar();
                             break;
         // Treat newline at beginning of line (blank line, no star)
         // as comment text.  Old Javadoc compatibility requires this.
         /*---------------------------------*
                         case CR: // (Spec 3.4)
                             doc_reader.scanCommentChar();
                             if (ch == LF) {
                                 col = 0;
                                 doc_reader.scanCommentChar();
                             }
                             break;
                         case LF: // (Spec 3.4)
                             doc_reader.scanCommentChar();
                             break;
         *---------------------------------*/
                         default:
                             // we've seen something that isn't whitespace;
                             // jump out.
                             break wsLoop;
                         }
                     }

                     // Are there stars here?  If so, consume them all
                     // and check for the end of comment.
                     if (comment_reader.ch == '*') {
                         // skip all of the stars
                         do {
                             comment_reader.scanCommentChar();
                         } while (comment_reader.ch == '*');

                         // check for the closing slash.
                         if (comment_reader.ch == '/') {
                             // We're done with the doc comment
                             // scanChar() and breakout.
                             break outerLoop;
                         }
                     } else if (! firstLine) {
                         // The current line does not begin with a '*' so we will
                         // treat it as comment
                         // (rewind to the start of the line so that the
                         // whitespace consumed above is emitted as text;
                         // note that only bp and ch are restored here)
                         comment_reader.bp = begin_bp;
                         comment_reader.ch = begin_ch;
                     }
                     // The textLoop processes the rest of the characters
                     // on the line, adding them to our buffer.
                 textLoop:
                     while (comment_reader.bp < comment_reader.buflen) {
                         switch (comment_reader.ch) {
                         case '*':
                             // Is this just a star?  Or is this the
                             // end of a comment?
                             comment_reader.scanCommentChar();
                             if (comment_reader.ch == '/') {
                                 // This is the end of the comment,
                                 // set ch and return our buffer.
                                 break outerLoop;
                             }
                             // This is just an ordinary star.  Add it to
                             // the buffer.
                             comment_reader.putChar('*', false);
                             break;
                         case '\\':
                             comment_reader.putChar('\\', false);
                             // If a double backslash was found, write two
                             if (comment_reader.isDoubleBackslash()) {
                                 comment_reader.putChar('\\', false);
                             }
                             comment_reader.scanCommentChar();
                             break;
                         case ' ':
                         case '\t':
                             comment_reader.putChar(comment_reader.ch, false);
                             comment_reader.scanCommentChar();
                             break;
                         case FF:
                             comment_reader.scanCommentChar();
                             break textLoop; // treat as end of line
                         case CR: // (Spec 3.4)
                             comment_reader.scanCommentChar();
                             if (comment_reader.ch != LF) {
                                 // Canonicalize CR-only line terminator to LF
                                 comment_reader.putChar((char)LF, false);
                                 break textLoop;
                             }
                             /* fall through to LF case */
                         case LF: // (Spec 3.4)
                             // We've seen a newline.  Add it to our
                             // buffer and break out of this loop,
                             // starting fresh on a new line.
                             comment_reader.putChar(comment_reader.ch, false);
                             comment_reader.scanCommentChar();
                             break textLoop;
                         default:
                             // Add the character to our buffer.
                             comment_reader.putChar(comment_reader.ch, false);
                             comment_reader.scanCommentChar();
                         }
                     } // end textLoop
                     firstLine = false;
                 } // end outerLoop

                 if (comment_reader.sp > 0) {
                     // Drop any run of stars at the very end of the
                     // collected text.
                     int i = comment_reader.sp - 1;
                 trailLoop:
                     while (i > -1) {
                         switch (comment_reader.sbuf[i]) {
                         case '*':
                             i--;
                             break;
                         default:
                             break trailLoop;
                         }
                     }
                     comment_reader.sp = i + 1;

                     // Store the text of the doc comment
                    docComment = comment_reader.chars();
                    // Keep only the used part of the position table.
                    docPosns = new int[comment_reader.pp];
                    System.arraycopy(comment_reader.pbuf, 0, docPosns, 0, docPosns.length);
                } else {
                    docComment = "";
                }
            } finally {
                // Runs on every exit path, including the early return for
                // an empty comment.
                scanned = true;
                comment_reader = null;
                if (docComment != null &&
                        DEPRECATED_PATTERN.matcher(docComment).matches()) {
                    deprecatedFlag = true;
                }
            }
        }
        //where:
            // Matches comment text that contains, at the start of some line
            // and after optional whitespace, the tag "@deprecated" followed
            // by either a space or the end of a line. The (?s) flag lets '.'
            // match line terminators and (?m) makes '^' and '$' match at
            // line boundaries, so the tag may appear on any line.
            private static final Pattern DEPRECATED_PATTERN =
                    Pattern.compile("(?sm).*^\\s*@deprecated( |$).*");
 463 
 464     }
 465 
 466     @Override
 467     public Position.LineMap getLineMap() {
 468         char[] buf = reader.getRawCharacters();
 469         return Position.makeLineMap(buf, buf.length, true);
 470     }
 471 }