1 /*
   2  * Copyright (c) 2017, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package build.tools.publicsuffixlist;
  27 
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.attribute.FileTime;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
  45 
  46 /**
  47  * This tool takes the original Mozilla public suffix rule list as input
  48  * and slices it into a set of files, one for each top-level domain.
  49  * Each file contains only the rules for that domain. Lines containing comments
  50  * or only whitespace are not copied. Each of these files are then combined
  51  * into the target zipfile.
  52  *
  53  * Usage: java GeneratePublicSuffixList mozilla_file destination_zipfile
  54  */
  55 public final class GeneratePublicSuffixList {
  56     // patterns
  57     private static final String COMMENT = "//";
  58     private static final String BEGIN_PRIVATE = "// ===BEGIN PRIVATE DOMAINS===";
  59     private static final Pattern WHITESPACE = Pattern.compile("\\s*");
  60     private static final byte ICANN = 0x00;
  61     private static final byte PRIVATE = 0x01;
  62 
  63     private static class Domain {
  64         final String name;
  65         final byte type;
  66         Domain(String name, byte type) {
  67             this.name = name;
  68             this.type = type;
  69         }
  70     }
  71 
  72     public static void main(String[] args) throws Exception {
  73         if (args.length != 2) {
  74             throw new Exception("2 args required: input_file output_file");
  75         }
  76         try (FileInputStream fis = new FileInputStream(args[0]);
  77              ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(args[1])))
  78         {
  79             BufferedReader br =
  80                 new BufferedReader(new InputStreamReader(fis, "UTF-8"));
  81 
  82             List<Domain> domains = new LinkedList<>();
  83             byte type = ICANN;
  84             String line;
  85             while ((line = br.readLine()) != null) {
  86                 if (line.startsWith(COMMENT)) {
  87                     if (line.startsWith(BEGIN_PRIVATE)) {
  88                         type = PRIVATE;
  89                     }
  90                     continue;
  91                 }
  92                 if (WHITESPACE.matcher(line).matches()) {
  93                     continue;
  94                 }
  95                 domains.add(new Domain(line, type));
  96             }
  97             // have a list of rules now
  98 
  99             // Map of TLD names to rules with the same TLD
 100             Map<String, List<Domain>> rules = addDomains(domains);
 101 
 102             // stream for writing the file contents
 103             BufferedWriter bw =
 104                 new BufferedWriter(new OutputStreamWriter(zos, "UTF-8"));
 105 
 106             // now output each map entry to its own file,
 107             // whose filename is the TLD
 108             writeRules(zos, bw, rules);
 109         }
 110     }
 111 
 112     private static Map<String, List<Domain>> addDomains(List<Domain> domains) {
 113         Map<String, List<Domain>> rules = new HashMap<>();
 114         for (Domain domain : domains) {
 115             String tld = getTLD(domain.name);
 116 
 117             rules.compute(tld, (k, v) -> {
 118                 if (v == null) {
 119                     List<Domain> newV = new LinkedList<>();
 120                     newV.add(domain);
 121                     return newV;
 122                 } else {
 123                     v.add(domain);
 124                     return v;
 125                 }
 126             });
 127         }
 128         return rules;
 129     }
 130 
 131     private static void writeRules(ZipOutputStream zos, BufferedWriter bw,
 132                                    Map<String, List<Domain>> rules)
 133                                    throws IOException {
 134         // Sort keys for deterministic output
 135         List<String> tlds = rules.keySet().stream().sorted().collect(Collectors.toList());
 136         for (String tld : tlds) {
 137             List<Domain> entries = rules.get(tld);
 138             ZipEntry ze = new ZipEntry(tld);
 139             ze.setLastModifiedTime(FileTime.fromMillis(0));
 140             zos.putNextEntry(ze);
 141             for (Domain entry : entries) {
 142                 bw.write(entry.type);
 143                 bw.write(entry.name, 0, entry.name.length());
 144                 bw.newLine();
 145             }
 146             bw.flush();
 147         }
 148     }
 149 
 150     private static String getTLD(String line) {
 151         int dotIndex = line.lastIndexOf('.');
 152         return (dotIndex == -1) ? line : line.substring(dotIndex + 1);
 153     }
 154 }