--- /dev/null 2013-10-11 17:30:01.735334065 +0400 +++ new/src/share/demo/lambda/BulkDataOperations/src/CSVProcessor.java 2013-10-11 17:36:51.180870800 +0400 @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * - Neither the name of Oracle nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This source code is provided to illustrate the usage of a given feature + * or technique and has been deliberately simplified. Additional steps + * required for a production-quality application, such as security checks, + * input validation and proper error handling, might not be present in + * this sample code. + */ +import java.io.BufferedReader; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.function.BinaryOperator; +import java.util.function.Predicate; +import java.util.function.Supplier; +import java.util.regex.Pattern; +import java.util.stream.Collector; +import java.util.stream.Collectors; +import static java.lang.Double.parseDouble; +import java.util.Arrays; +import java.util.Comparator; +import java.util.DoubleSummaryStatistics; +import java.util.EnumSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; +import java.util.function.BiConsumer; +import java.util.function.Function; + +/** + * CSVProcessor is a tool for processing CSV file. There are several command + * line options. Please consult printUsageAndExit(...) method for more info. + * This sample shows examples of using next features: + *
    + *
  1. Lambda and bulk operations. Working with streams: map(...), filter(...), + * sorted(...) methods. collect(...) method with different collectors: + * Collectors.maxBy(...), Collectors.minBy(...), Collectors.toList(), + * Collectors.toCollection(...), Collectors.groupingBy(...), + * Collectors.toDoubleSummaryStatistics(...), a custom Collector.
  2. + *
  3. Static method reference for printing values.
  4. + *
  5. Try-with-resources feature for closing files.
  6. + *
  7. Switch by String feature. + *
  8. Other new API: Pattern.asPredicate(), BinaryOperator + * BufferedReader.lines(), Collection.forEach(...), Comparator.comparing(...), + * Comparator.reversed(), Arrays.stream(...).
  9. + *
+ * + * @author Andrey Nazarov + */ +public class CSVProcessor { + + //Number of characters that may be read + private static final int READ_AHEAD_LIMIT = 100_000_000; + + /** + * The main method for the CSVProcessor program. Run program with empty + * argument list to see possible arguments. + * + * @param args the argument list for CSVProcessor. + */ + public static void main(String[] args) { + if (args.length < 2) { + printUsageAndExit(); + } + try (BufferedReader br = new BufferedReader( + Files.newBufferedReader(Paths.get(args[args.length - 1]), + StandardCharsets.UTF_8))) { + //assume first line contains column names + List header = Arrays.stream(br.readLine().split(",")) + .map(String::trim).collect(Collectors.toList()); + //calculate an index of the column in question + int column = getColumnNumber(header, args[1]); + switch (args[0]) { + case "sort": + verifyArgumentNumber(args, 4); + //define sort order + boolean isAsc; + switch (args[2].toUpperCase()) { + case "ASC": + isAsc = true; + break; + case "DESC": + isAsc = false; + break; + default: + printUsageAndExit("Illegal argument" + args[2]); + return;//should not be reached + } + /* + * create comparator that compares lines by comparing values + * in the specified column. + */ + Comparator cmp + = Comparator.comparing(str -> getCell(str, column), + String.CASE_INSENSITIVE_ORDER); + /* + * sorted(...) is used to sort records. + * forEach(...) is used to output sorted records. + */ + br.lines().sorted(isAsc ? cmp : cmp.reversed()) + .forEach(System.out::println); + break; + case "search": + verifyArgumentNumber(args, 4); + /* + * records are filtered by a regex. + * forEach(...) is used to output filtered records. + */ + Predicate pattern + = Pattern.compile(args[2]).asPredicate(); + br.lines().filter(str -> pattern.test(getCell(str, column))) + .forEach(System.out::println); + break; + case "groupby": + verifyArgumentNumber(args, 3); + /* + * group lines by values in the column with collect(...), + * print with forEach(...) for every distinct value within + * the column. + */ + br.lines().collect( + Collectors.groupingBy( + str -> getCell(str, column), + Collectors.toCollection(TreeSet::new))) + .forEach((str, set) -> { + System.out.println(str + ":"); + set.forEach(System.out::println); + }); + break; + case "stat": + verifyArgumentNumber(args, 3); + + /* + * BufferedReader will be read several times. + * We mark this point to return here after each pass. + */ + br.mark(READ_AHEAD_LIMIT); + + //Statistics can be collected by a custom collector in one pass + System.out.println( + br.lines().collect(new Statistics(column))); + br.reset(); + + /* + * Alternatively, statistics can be collected + * by built-in API in several passes. + */ + statInSeveralPasses(br, column); + break; + default: + printUsageAndExit("Illegal argument" + args[0]); + } + } catch (IOException e) { + printUsageAndExit(e.toString()); + } + } + + private static void statInSeveralPasses(BufferedReader br, int column) + throws IOException { + System.out.println("#-----Statistic in several passes-------#"); + //create comparator to compare records by the column. + Comparator comparator + = Comparator.comparing( + (String str) -> parseDouble(getCell(str, column))); + //find max record by Collectors.maxBy(...) + System.out.println( + "Max: " + br.lines().collect(Collectors.maxBy(comparator))); + br.reset(); + //find min record by Collectors.minBy(...) + System.out.println( + "Min: " + br.lines().collect(Collectors.minBy(comparator))); + br.reset(); + //Compute average value and sum with Collectors.toDoubleSummaryStatistics(...) + DoubleSummaryStatistics doubleSummaryStatistics + = br.lines().collect( + Collectors.summarizingDouble( + str -> parseDouble(getCell(str, column)))); + System.out.println("Average: " + doubleSummaryStatistics.getAverage()); + System.out.println("Sum: " + doubleSummaryStatistics.getSum()); + } + + private static void verifyArgumentNumber(String[] args, int n) { + if (args.length != n) { + printUsageAndExit("Expected " + n + " arguments but was " + + args.length); + } + } + + private static int getColumnNumber(List header, String name) { + int column = header.indexOf(name); + if (column == -1) { + printUsageAndExit("There is no column with name " + name); + } + return column; + } + + private static String getCell(String record, int column) { + return record.split(",")[column].trim(); + } + + private static void printUsageAndExit(String... str) { + System.out.println("Usages:"); + + System.out.println("CSVProcessor sort COLUMN_NAME ASC|DESC FILE"); + System.out.println("Sort lines by column COLUMN_NAME in CSV FILE\n"); + + System.out.println("CSVProcessor search COLUMN_NAME REGEX FILE"); + System.out.println("Search for REGEX in column COLUMN_NAME in CSV FILE\n"); + + System.out.println("CSVProcessor groupby COLUMN_NAME FILE"); + System.out.println("Split lines into different groups according column " + + "COLUMN_NAME value\n"); + + System.out.println("CSVProcessor stat COLUMN_NAME FILE"); + System.out.println("Compute max/min/average/sum statistics by column " + + "COLUMN_NAME\n"); + + Arrays.asList(str).forEach(System.out::println); + System.exit(1); + } + + /* + * This is custom implementation of Collector interface. + * Statitics objects gather max,min,sum,average statistics. + */ + private static class Statistics + implements Collector { + + + /* + * @implNote This implementation is not thread safe. + * However, it is safe to use Statistics on a parallel stream, because + * the parallel implementation of + * {@link java.util.stream.Stream#collect Stream.collect()} + * provides the necessary partitioning, isolation, and merging of results for + * safe and efficient parallel execution. + */ + private String maxRecord; + private String minRecord; + + private double sum; + private int lineCount; + private final BinaryOperator maxOperator; + private final BinaryOperator minOperator; + private final int column; + + public Statistics(int column) { + this.column = column; + Comparator cmp = Comparator.comparing( + (String str) -> parseDouble(getCell(str, column))); + maxOperator = BinaryOperator.maxBy(cmp); + minOperator = BinaryOperator.minBy(cmp); + } + + /* + * Process line + */ + public Statistics accept(String line) { + maxRecord = maxRecord == null + ? line : maxOperator.apply(maxRecord, line); + minRecord = minRecord == null + ? line : minOperator.apply(minRecord, line); + + sum += parseDouble(getCell(line, column)); + lineCount++; + return this; + } + + + /* + * Merge two Statistics + */ + public Statistics combine(Statistics stat) { + maxRecord = maxOperator.apply(maxRecord, stat.getMaxRecord()); + minRecord = minOperator.apply(minRecord, stat.getMinRecord()); + sum += stat.getSum(); + lineCount += stat.getLineCount(); + return this; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("#------Statistics------#\n"); + sb.append("Max: ").append(getMaxRecord()).append("\n"); + sb.append("Min: ").append(getMinRecord()).append("\n"); + sb.append("Sum = ").append(getSum()).append("\n"); + sb.append("Average = ").append(average()).append("\n"); + sb.append("#------Statistics------#\n"); + return sb.toString(); + } + + @Override + public Supplier supplier() { + return () -> new Statistics(column); + } + + @Override + public BiConsumer accumulator() { + return Statistics::accept; + } + + @Override + public BinaryOperator combiner() { + return Statistics::combine; + + } + + @Override + public Function finisher() { + return stat -> stat; + } + + @Override + public Set characteristics() { + return EnumSet.of(Characteristics.IDENTITY_FINISH); + } + + private String getMaxRecord() { + return maxRecord; + } + + private String getMinRecord() { + return minRecord; + } + + private double getSum() { + return sum; + } + + private double average() { + return sum / lineCount; + } + + private int getLineCount() { + return lineCount; + } + + } + +}