New src/share/demo/lambda/BulkDataOperations/src/CSVProcessor.java

   1 /*
   2  * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  *
   8  *   - Redistributions of source code must retain the above copyright
   9  *     notice, this list of conditions and the following disclaimer.
  10  *
  11  *   - Redistributions in binary form must reproduce the above copyright
  12  *     notice, this list of conditions and the following disclaimer in the
  13  *     documentation and/or other materials provided with the distribution.
  14  *
  15  *   - Neither the name of Oracle nor the names of its
  16  *     contributors may be used to endorse or promote products derived
  17  *     from this software without specific prior written permission.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
  20  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  23  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  26  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  27  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  28  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  29  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30  */
  31 
  32 /*
  33  * This source code is provided to illustrate the usage of a given feature
  34  * or technique and has been deliberately simplified. Additional steps
  35  * required for a production-quality application, such as security checks,
  36  * input validation and proper error handling, might not be present in
  37  * this sample code.
  38  */
  39 import java.io.BufferedReader;
  40 import java.io.IOException;
  41 import java.nio.charset.StandardCharsets;
  42 import java.nio.file.Files;
  43 import java.nio.file.Paths;
  44 import java.util.function.BinaryOperator;
  45 import java.util.function.Predicate;
  46 import java.util.function.Supplier;
  47 import java.util.regex.Pattern;
  48 import java.util.stream.Collector;
  49 import java.util.stream.Collectors;
  50 import static java.lang.Double.parseDouble;
  51 import java.util.Arrays;
  52 import java.util.Comparator;
  53 import java.util.DoubleSummaryStatistics;
  54 import java.util.EnumSet;
  55 import java.util.List;
  56 import java.util.Set;
  57 import java.util.TreeSet;
  58 import java.util.function.BiConsumer;
  59 import java.util.function.Function;
  60 
  61 /**
  62  * CSVProcessor is a tool for processing CSV files. There are several command
  63  * line options. Please consult the printUsageAndExit(...) method for more info.
  64  * This sample shows examples of using the following features:
  65  * <ol>
  66  * <li>Lambda and bulk operations. Working with streams: map(...), filter(...),
  67  * sorted(...) methods. collect(...) method with different collectors:
  68  * Collectors.maxBy(...), Collectors.minBy(...), Collectors.toList(),
  69  * Collectors.toCollection(...), Collectors.groupingBy(...),
  70  * Collectors.summarizingDouble(...), a custom Collector.</li>
  71  * <li>Static method reference for printing values.</li>
  72  * <li>Try-with-resources feature for closing files.</li>
  73  * <li>Switch by String feature.</li>
  74  * <li>Other new API: Pattern.asPredicate(), BinaryOperator,
  75  * BufferedReader.lines(), Collection.forEach(...), Comparator.comparing(...),
  76  * Comparator.reversed(), Arrays.stream(...).</li>
  77  * </ol>
  78  *
  79  * @author Andrey Nazarov
  80  */
  81 public class CSVProcessor {
  82 
  83     //Number of characters that may be read
  84     private static final int READ_AHEAD_LIMIT = 100_000_000;
  85 
  86     /**
  87      * The main method for the CSVProcessor program. Run program with empty
  88      * argument list to see possible arguments.
  89      *
  90      * @param args the argument list for CSVProcessor.
  91      */
  92     public static void main(String[] args) {
  93         if (args.length < 2) {
  94             printUsageAndExit();
  95         }
  96         try (BufferedReader br = new BufferedReader(
  97                 Files.newBufferedReader(Paths.get(args[args.length - 1]),
  98                         StandardCharsets.UTF_8))) {
  99             //assume first line contains column names
 100             List<String> header = Arrays.stream(br.readLine().split(","))
 101                     .map(String::trim).collect(Collectors.toList());
 102             //calculate an index of the column in question
 103             int column = getColumnNumber(header, args[1]);
 104             switch (args[0]) {
 105                 case "sort":
 106                     verifyArgumentNumber(args, 4);
 107                     //define sort order
 108                     boolean isAsc;
 109                     switch (args[2].toUpperCase()) {
 110                         case "ASC":
 111                             isAsc = true;
 112                             break;
 113                         case "DESC":
 114                             isAsc = false;
 115                             break;
 116                         default:
 117                             printUsageAndExit("Illegal argument" + args[2]);
 118                             return;//should not be reached
 119                     }
 120                     /*
 121                      * create comparator that compares lines by comparing values
 122                      * in the specified column.
 123                      */
 124                     Comparator<String> cmp
 125                             = Comparator.comparing(str -> getCell(str, column),
 126                                     String.CASE_INSENSITIVE_ORDER);
 127                     /*
 128                      * sorted(...) is used to sort records.
 129                      * forEach(...) is used to output sorted records.
 130                      */
 131                     br.lines().sorted(isAsc ? cmp : cmp.reversed())
 132                             .forEach(System.out::println);
 133                     break;
 134                 case "search":
 135                     verifyArgumentNumber(args, 4);
 136                     /*
 137                      * records are filtered by a regex.
 138                      * forEach(...) is used to output filtered records.
 139                      */
 140                     Predicate<String> pattern
 141                             = Pattern.compile(args[2]).asPredicate();
 142                     br.lines().filter(str -> pattern.test(getCell(str, column)))
 143                             .forEach(System.out::println);
 144                     break;
 145                 case "groupby":
 146                     verifyArgumentNumber(args, 3);
 147                     /*
 148                      * group lines by values in the column with collect(...),
 149                      * print with forEach(...) for every distinct value within
 150                      * the column.
 151                      */
 152                     br.lines().collect(
 153                             Collectors.groupingBy(
 154                                     str -> getCell(str, column),
 155                                     Collectors.toCollection(TreeSet::new)))
 156                             .forEach((str, set) -> {
 157                                 System.out.println(str + ":");
 158                                 set.forEach(System.out::println);
 159                             });
 160                     break;
 161                 case "stat":
 162                     verifyArgumentNumber(args, 3);
 163 
 164                     /*
 165                      * BufferedReader will be read several times.
 166                      * We mark this point to return here after each pass.
 167                      */
 168                     br.mark(READ_AHEAD_LIMIT);
 169 
 170                     //Statistics can be collected by a custom collector in one pass
 171                     System.out.println(
 172                             br.lines().collect(new Statistics(column)));
 173                     br.reset();
 174 
 175                     /*
 176                      * Alternatively, statistics can be collected
 177                      * by built-in API in several passes.
 178                      */
 179                     statInSeveralPasses(br, column);
 180                     break;
 181                 default:
 182                     printUsageAndExit("Illegal argument" + args[0]);
 183             }
 184         } catch (IOException e) {
 185             printUsageAndExit(e.toString());
 186         }
 187     }
 188 
 189     private static void statInSeveralPasses(BufferedReader br, int column)
 190             throws IOException {
 191         System.out.println("#-----Statistic in several passes-------#");
 192         //create comparator to compare records by the column.
 193         Comparator<String> comparator
 194                 = Comparator.comparing(
 195                         (String str) -> parseDouble(getCell(str, column)));
 196         //find max record by Collectors.maxBy(...)
 197         System.out.println(
 198                 "Max: " + br.lines().collect(Collectors.maxBy(comparator)));
 199         br.reset();
 200         //find min record by Collectors.minBy(...)
 201         System.out.println(
 202                 "Min: " + br.lines().collect(Collectors.minBy(comparator)));
 203         br.reset();
 204         //Compute average value and sum with Collectors.toDoubleSummaryStatistics(...)
 205         DoubleSummaryStatistics doubleSummaryStatistics
 206                 = br.lines().collect(
 207                         Collectors.summarizingDouble(
 208                                 str -> parseDouble(getCell(str, column))));
 209         System.out.println("Average: " + doubleSummaryStatistics.getAverage());
 210         System.out.println("Sum: " + doubleSummaryStatistics.getSum());
 211     }
 212 
 213     private static void verifyArgumentNumber(String[] args, int n) {
 214         if (args.length != n) {
 215             printUsageAndExit("Expected " + n + " arguments but was "
 216                     + args.length);
 217         }
 218     }
 219 
 220     private static int getColumnNumber(List<String> header, String name) {
 221         int column = header.indexOf(name);
 222         if (column == -1) {
 223             printUsageAndExit("There is no column with name " + name);
 224         }
 225         return column;
 226     }
 227 
 228     private static String getCell(String record, int column) {
 229         return record.split(",")[column].trim();
 230     }
 231 
 232     private static void printUsageAndExit(String... str) {
 233         System.out.println("Usages:");
 234 
 235         System.out.println("CSVProcessor sort COLUMN_NAME ASC|DESC FILE");
 236         System.out.println("Sort lines by column COLUMN_NAME in CSV FILE\n");
 237 
 238         System.out.println("CSVProcessor search COLUMN_NAME REGEX FILE");
 239         System.out.println("Search for REGEX in column COLUMN_NAME in CSV FILE\n");
 240 
 241         System.out.println("CSVProcessor groupby COLUMN_NAME FILE");
 242         System.out.println("Split lines into different groups according column "
 243                 + "COLUMN_NAME value\n");
 244 
 245         System.out.println("CSVProcessor stat COLUMN_NAME FILE");
 246         System.out.println("Compute max/min/average/sum  statistics by column "
 247                 + "COLUMN_NAME\n");
 248 
 249         Arrays.asList(str).forEach(System.out::println);
 250         System.exit(1);
 251     }
 252 
 253     /*
 254      * This is custom implementation of Collector interface.
 255      * Statitics objects gather max,min,sum,average statistics.
 256      */
 257     private static class Statistics
 258             implements Collector<String, Statistics, Statistics> {
 259 
 260 
 261         /*
 262          * @implNote This implementation is not thread safe.
 263          * However, it is safe to use Statistics on a parallel stream, because
 264          * the parallel implementation of
 265          * {@link java.util.stream.Stream#collect Stream.collect()}
 266          * provides the necessary partitioning, isolation, and merging of results for
 267          * safe and efficient parallel execution.
 268          */
 269         private String maxRecord;
 270         private String minRecord;
 271 
 272         private double sum;
 273         private int lineCount;
 274         private final BinaryOperator<String> maxOperator;
 275         private final BinaryOperator<String> minOperator;
 276         private final int column;
 277 
 278         public Statistics(int column) {
 279             this.column = column;
 280             Comparator<String> cmp = Comparator.comparing(
 281                     (String str) -> parseDouble(getCell(str, column)));
 282             maxOperator = BinaryOperator.maxBy(cmp);
 283             minOperator = BinaryOperator.minBy(cmp);
 284         }
 285 
 286         /*
 287          * Process line
 288          */
 289         public Statistics accept(String line) {
 290             maxRecord = maxRecord == null
 291                     ? line : maxOperator.apply(maxRecord, line);
 292             minRecord = minRecord == null
 293                     ? line : minOperator.apply(minRecord, line);
 294 
 295             sum += parseDouble(getCell(line, column));
 296             lineCount++;
 297             return this;
 298         }
 299 
 300 
 301         /*
 302          * Merge two Statistics
 303          */
 304         public Statistics combine(Statistics stat) {
 305             maxRecord = maxOperator.apply(maxRecord, stat.getMaxRecord());
 306             minRecord = minOperator.apply(minRecord, stat.getMinRecord());
 307             sum += stat.getSum();
 308             lineCount += stat.getLineCount();
 309             return this;
 310         }
 311 
 312         @Override
 313         public String toString() {
 314             StringBuilder sb = new StringBuilder();
 315             sb.append("#------Statistics------#\n");
 316             sb.append("Max: ").append(getMaxRecord()).append("\n");
 317             sb.append("Min: ").append(getMinRecord()).append("\n");
 318             sb.append("Sum = ").append(getSum()).append("\n");
 319             sb.append("Average = ").append(average()).append("\n");
 320             sb.append("#------Statistics------#\n");
 321             return sb.toString();
 322         }
 323 
 324         @Override
 325         public Supplier<Statistics> supplier() {
 326             return () -> new Statistics(column);
 327         }
 328 
 329         @Override
 330         public BiConsumer<Statistics, String> accumulator() {
 331             return Statistics::accept;
 332         }
 333 
 334         @Override
 335         public BinaryOperator<Statistics> combiner() {
 336             return Statistics::combine;
 337 
 338         }
 339 
 340         @Override
 341         public Function<Statistics, Statistics> finisher() {
 342             return stat -> stat;
 343         }
 344 
 345         @Override
 346         public Set<Characteristics> characteristics() {
 347             return EnumSet.of(Characteristics.IDENTITY_FINISH);
 348         }
 349 
 350         private String getMaxRecord() {
 351             return maxRecord;
 352         }
 353 
 354         private String getMinRecord() {
 355             return minRecord;
 356         }
 357 
 358         private double getSum() {
 359             return sum;
 360         }
 361 
 362         private double average() {
 363             return sum / lineCount;
 364         }
 365 
 366         private int getLineCount() {
 367             return lineCount;
 368         }
 369 
 370     }
 371 
 372 }