optimaize · calou · Jan 14, 2016 · rmtheis · Mar 27, 2016 · calou
diff --git a/src/main/java/com/optimaize/langdetect/LanguageDetectorBuilder.java b/src/main/java/com/optimaize/langdetect/LanguageDetectorBuilder.java
@@ -84,6 +84,7 @@ public LanguageDetectorBuilder affixFactor(double affixFactor) {
         suffixFactor(affixFactor);
         return this;
     }
+
     /**
      * To weight n-grams that are on the left border of a word differently from n-grams
      * in the middle of words, assign a value here.
@@ -98,6 +99,7 @@ public LanguageDetectorBuilder prefixFactor(double prefixFactor) {
         this.prefixFactor = prefixFactor;
         return this;
     }
+
     /**
      * Defaults to 1.0, which means don't use this feature.
      * @param suffixFactor 0.0 to 10.0, a suggested value is 2.0
@@ -154,6 +156,7 @@ public LanguageDetectorBuilder withProfile(LanguageProfile languageProfile) thro
         languageProfiles.add(languageProfile);
         return this;
     }
+
     /**
      * @throws IllegalStateException if a profile for the same language was added already (must be a userland bug).
      */

diff --git a/src/main/java/com/optimaize/langdetect/LanguageDetectorImpl.java b/src/main/java/com/optimaize/langdetect/LanguageDetectorImpl.java
@@ -128,11 +128,7 @@ public Optional<LdLocale> detect(CharSequence text) {
     @Override
     public List<DetectedLanguage> getProbabilities(CharSequence text) {
         double[] langprob = detectBlock(text);
-        if (langprob==null) {
-            return Collections.emptyList();
-        } else {
-            return sortProbability(langprob);
-        }
+        return langprob==null ? Collections.<DetectedLanguage>emptyList() : sortProbability(langprob);
     }
 
 
@@ -143,12 +139,10 @@ public List<DetectedLanguage> getProbabilities(CharSequence text) {
     private double[] detectBlock(CharSequence text) {
         if (text.length() <= shortTextAlgorithm) {
             Map<String, Integer> ngrams = ngramExtractor.extractCountedGrams(text);
-            if (ngrams.isEmpty()) return null;
-            return detectBlockShortText(ngrams);
+            return ngrams.isEmpty() ? null : detectBlockShortText(ngrams);
         } else {
             List<String> strings = ngramExtractor.extractGrams(text);
-            if (strings.isEmpty()) return null;
-            return detectBlockLongText(strings);
+            return strings.isEmpty() ? null : detectBlockLongText(strings);
         }
     }
 
@@ -171,8 +165,11 @@ private double[] detectBlockShortText(Map<String, Integer> ngrams) {
      */
     private double[] detectBlockLongText(List<String> ngrams) {
         assert !ngrams.isEmpty();
+        final boolean traceEnabled = logger.isTraceEnabled();
+
         double[] langprob = new double[ngramFrequencyData.getLanguageList().size()];
         Random rand = new Random(seed.or(DEFAULT_SEED));
+
         for (int t = 0; t < N_TRIAL; ++t) {
             double[] prob = initProbability();
             double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH);
@@ -182,11 +179,11 @@ private double[] detectBlockLongText(List<String> ngrams) {
                 updateLangProb(prob, ngrams.get(r), 1, alpha);
                 if (i % 5 == 0) {
                     if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this looks like an optimization to return quickly when sure. TODO document what's the plan.
-                    if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob));
+                    if (traceEnabled) logger.trace("> " + sortProbability(prob));
                 }
             }
             for(int j=0;j<langprob.length;++j) langprob[j] += prob[j] / N_TRIAL;
-            if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob));
+            if (traceEnabled) logger.trace("==> " + sortProbability(prob));
         }
         return langprob;
     }
@@ -199,11 +196,11 @@ private double[] detectBlockLongText(List<String> ngrams) {
     private double[] initProbability() {
         double[] prob = new double[ngramFrequencyData.getLanguageList().size()];
         if (priorMap != null) {
-            //TODO analyze and optimize this code, looks like double copy.
             System.arraycopy(priorMap, 0, prob, 0, prob.length);
-            for(int i=0;i<prob.length;++i) prob[i] = priorMap[i];
         } else {
-            for(int i=0;i<prob.length;++i) prob[i] = 1.0 / ngramFrequencyData.getLanguageList().size();
+            for(int i=0;i<prob.length;++i) {
+                prob[i] = 1.0 / ngramFrequencyData.getLanguageList().size();
+            }
         }
         return prob;
     }
@@ -244,19 +241,14 @@ private boolean updateLangProb(@NotNull double[] prob, @NotNull String ngram, in
      */
     @NotNull
     private List<DetectedLanguage> sortProbability(double[] prob) {
-        List<DetectedLanguage> list = new ArrayList<>();
+        List<DetectedLanguage> list = new ArrayList<>(prob.length);
         for (int j=0;j<prob.length;++j) {
             double p = prob[j];
             if (p >= probabilityThreshold) {
-                for (int i=0; i<=list.size(); ++i) {
-                    if (i == list.size() || list.get(i).getProbability() < p) {
-                        list.add(i, new DetectedLanguage(ngramFrequencyData.getLanguage(j), p));
-                        break;
-                    }
-                }
+                list.add(new DetectedLanguage(ngramFrequencyData.getLanguage(j), p));
             }
         }
+        Collections.sort(list);
         return list;
     }
-
 }
diff --git a/src/main/java/com/optimaize/langdetect/NgramFrequencyData.java b/src/main/java/com/optimaize/langdetect/NgramFrequencyData.java
@@ -46,8 +46,8 @@ public static NgramFrequencyData create(@NotNull Collection<LanguageProfile> lan
         if (gramLengths.isEmpty()) throw new IllegalArgumentException("No gramLengths provided!");
 
         Map<String, double[]> wordLangProbMap = new HashMap<>();
-        List<LdLocale> langlist = new ArrayList<>();
-        int langsize = languageProfiles.size();
+        final int langsize = languageProfiles.size();
+        List<LdLocale> langlist = new ArrayList<>(langsize);
 
         int index = -1;
         for (LanguageProfile profile : languageProfiles) {

diff --git a/src/main/java/com/optimaize/langdetect/i18n/LdLocale.java b/src/main/java/com/optimaize/langdetect/i18n/LdLocale.java
@@ -49,6 +49,7 @@
  */
 public final class LdLocale {
 
+    public static final Splitter DASH_SPLITTER = Splitter.on('-');
     @NotNull
     private final String language;
     @NotNull
@@ -74,7 +75,7 @@ public static LdLocale fromString(@NotNull String string) {
         Optional<String> script = null;
         Optional<String> region = null;
 
-        List<String> strings = Splitter.on('-').splitToList(string);
+        List<String> strings = DASH_SPLITTER.splitToList(string);
         for (int i=0; i<strings.size(); i++) {
             String chunk = strings.get(i);
             if (i==0) {

diff --git a/src/main/java/com/optimaize/langdetect/ngram/BackwardsCompatibleNgramFilter.java b/src/main/java/com/optimaize/langdetect/ngram/BackwardsCompatibleNgramFilter.java
@@ -21,26 +21,11 @@ private BackwardsCompatibleNgramFilter() {
     public boolean use(String ngram) {
         switch (ngram.length()) {
             case 1:
-                if (ngram.charAt(0)==' ') {
-                    return false;
-                }
-                return true;
+                return ngram.charAt(0) != ' ';
             case 2:
-                if (Character.isUpperCase(ngram.charAt(0)) && Character.isUpperCase(ngram.charAt(1))) {
-                    //all upper case
-                    return false;
-                }
-                return true;
+                return !ngram.equals(ngram.toUpperCase());
             case 3:
-                if (Character.isUpperCase(ngram.charAt(0)) && Character.isUpperCase(ngram.charAt(1)) && Character.isUpperCase(ngram.charAt(2))) {
-                    //all upper case
-                    return false;
-                }
-                if (ngram.charAt(1)==' ') {
-                    //middle char is a space
-                    return false;
-                }
-                return true;
+                return ngram.charAt(1) !=' ' && !ngram.equals(ngram.toUpperCase());
             default:
                 throw new UnsupportedOperationException("Unsupported n-gram length: "+ngram.length());
         }

diff --git a/src/main/java/com/optimaize/langdetect/ngram/NgramExtractor.java b/src/main/java/com/optimaize/langdetect/ngram/NgramExtractor.java
@@ -81,12 +81,12 @@ public List<String> extractGrams(@NotNull CharSequence text) {
             return Collections.emptyList();
         }
         List<String> grams = new ArrayList<>(totalNumGrams);
-
+        String textAsString = text.toString();
         for (Integer gramLength : gramLengths) {
             int numGrams = len - (gramLength -1);
             if (numGrams >= 1) { //yes can be negative
                 for (int pos=0; pos<numGrams; pos++) {
-                    String gram = text.subSequence(pos, pos + gramLength).toString();
+                    String gram = textAsString.substring(pos, pos + gramLength);
                     if (filter==null || filter.use(gram)) {
                         grams.add(gram);
                     }

diff --git a/src/main/java/com/optimaize/langdetect/ngram/OldNgramExtractor.java b/src/main/java/com/optimaize/langdetect/ngram/OldNgramExtractor.java
@@ -46,10 +46,8 @@ public static List<String> extractNGrams(@NotNull CharSequence text, @Nullable F
             ngram.addChar(text.charAt(i));
             for(int n=1;n<=NGram.N_GRAM;++n){
                 String w = ngram.get(n);
-                if (w!=null) { //TODO this null check is ugly
-                    if (filter==null || filter.use(w)) {
-                        list.add(w);
-                    }
+                if (w!=null && (filter==null || filter.use(w))) {
+                    list.add(w);
                 }
             }
         }

diff --git a/src/main/java/com/optimaize/langdetect/ngram/StandardNgramFilter.java b/src/main/java/com/optimaize/langdetect/ngram/StandardNgramFilter.java
@@ -13,35 +13,23 @@ public static NgramFilter getInstance() {
         return INSTANCE;
     }
 
-    private StandardNgramFilter() {
-    }
+    private StandardNgramFilter() {}
 
     @Override
     public boolean use(String ngram) {
         switch (ngram.length()) {
             case 1:
-                if (ngram.charAt(0)==' ') {
-                    return false;
-                }
-                return true;
+                return ngram.charAt(0) != ' ';
             case 2:
                 return true;
             case 3:
-                if (ngram.charAt(1)==' ') {
-                    //middle char is a space
-                    return false;
-                }
-                return true;
+                return ngram.charAt(1) != ' ';
             case 4:
-                if (ngram.charAt(1)==' ' || ngram.charAt(2)==' ') {
-                    //one of the middle chars is a space
-                    return false;
-                }
-                return true;
+                //reject the n-gram if one of the middle chars is a space
+                return ngram.charAt(1) !=' ' && ngram.charAt(2) !=' ';
             default:
                 //would need the same check: no space in the middle, border is fine.
                 throw new UnsupportedOperationException("Unsupported n-gram length: "+ngram.length());
         }
     }
-
 }
diff --git a/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileImpl.java b/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileImpl.java
@@ -65,9 +65,9 @@ public Stats(@NotNull Map<Integer, Long> numOccurrences,
     }
 
     private static Stats makeStats(Map<Integer, Map<String, Integer>> ngrams) {
-        Map<Integer, Long> numOccurrences = new HashMap<>(6);
-        Map<Integer, Long> minGramCounts = new HashMap<>(6);
-        Map<Integer, Long> maxGramCounts = new HashMap<>(6);
+        Map<Integer, Long> numOccurrences = new HashMap<>(ngrams.size());
+        Map<Integer, Long> minGramCounts = new HashMap<>(ngrams.size());
+        Map<Integer, Long> maxGramCounts = new HashMap<>(ngrams.size());
         for (Map.Entry<Integer, Map<String, Integer>> entry : ngrams.entrySet()) {
             long count = 0;
             Long min = null;
@@ -81,9 +81,10 @@ private static Stats makeStats(Map<Integer, Map<String, Integer>> ngrams) {
                     max = (long)integer;
                 }
             }
-            numOccurrences.put(entry.getKey(), count);
-            minGramCounts.put(entry.getKey(), min);
-            maxGramCounts.put(entry.getKey(), max);
+            final Integer key = entry.getKey();
+            numOccurrences.put(key, count);
+            minGramCounts.put(key, min);
+            maxGramCounts.put(key, max);
         }
         return new Stats(numOccurrences, minGramCounts, maxGramCounts);
     }

diff --git a/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileReader.java b/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileReader.java
@@ -97,7 +97,7 @@ private String makeProfileFileName(@NotNull LdLocale locale) {
 
     @NotNull
     public List<LanguageProfile> readBuiltIn(@NotNull Collection<LdLocale> languages) throws IOException {
-        List<String> profileNames = new ArrayList<>();
+        List<String> profileNames = new ArrayList<>(languages.size());
         for (LdLocale locale : languages) {
             profileNames.add(makeProfileFileName(locale));
         }
@@ -110,12 +110,14 @@ public List<LanguageProfile> readBuiltIn(@NotNull Collection<LdLocale> languages
     public List<LanguageProfile> readAll() throws IOException {
         return readAllBuiltIn();
     }
+
     /**
      * Reads all built-in language profiles from the "languages" folder (shipped with the jar).
      */
     public List<LanguageProfile> readAllBuiltIn() throws IOException {
-        List<LanguageProfile> loaded = new ArrayList<>();
-        for (LdLocale locale : BuiltInLanguages.getLanguages()) {
+        final List<LdLocale> languages = BuiltInLanguages.getLanguages();
+        List<LanguageProfile> loaded = new ArrayList<>(languages.size());
+        for (LdLocale locale : languages) {
             loaded.add(readBuiltIn(locale));
         }
         return loaded;
@@ -148,20 +150,17 @@ public boolean accept(File pathname) {
 
         List<LanguageProfile> profiles = new ArrayList<>(listFiles.length);
         for (File file: listFiles) {
-            if (!looksLikeLanguageProfileFile(file)) {
-                continue;
+            if (looksLikeLanguageProfileFile(file)) {
+                profiles.add(read(file));
             }
-            profiles.add(read(file));
         }
         return profiles;
     }
 
     private boolean looksLikeLanguageProfileFile(File file) {
-        if (!file.isFile()) {
-            return false;
-        }
-        return looksLikeLanguageProfileName(file.getName());
+        return file.isFile() ? looksLikeLanguageProfileName(file.getName()) : false;
     }
+
     private boolean looksLikeLanguageProfileName(String fileName) {
         if (fileName.contains(".")) {
             return false;
@@ -173,5 +172,4 @@ private boolean looksLikeLanguageProfileName(String fileName) {
             return false;
         }
     }
-
 }
diff --git a/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileWriter.java b/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileWriter.java
@@ -28,12 +28,13 @@ public void write(@NotNull LanguageProfile languageProfile, @NotNull OutputStrea
             for (Map.Entry<String, Integer> entry : languageProfile.iterateGrams()) {
                 if (!first) {
                     writer.write(',');
+                } else {
+                    first = false;
                 }
                 writer.write('"');
                 writer.write(entry.getKey());
                 writer.write("\":");
                 writer.write(entry.getValue().toString());
-                first = false;
             }
             writer.write("},\"n_words\":[");
             first = true;

diff --git a/src/main/java/com/optimaize/langdetect/text/MultiTextFilter.java b/src/main/java/com/optimaize/langdetect/text/MultiTextFilter.java
@@ -20,11 +20,7 @@ public class MultiTextFilter implements TextFilter {
      * @param filters may be empty by definition
      */
     public MultiTextFilter(@NotNull List<TextFilter> filters) {
-        if (filters.isEmpty()) {
-            this.filters = null;
-        } else {
-            this.filters = ImmutableList.copyOf(filters);
-        }
+        this.filters = filters.isEmpty() ? null : ImmutableList.copyOf(filters);
     }
 
     @Override

diff --git a/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java b/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java
@@ -112,11 +112,7 @@ private Map<Character.UnicodeScript, Long> countByScript(CharSequence text) {
     }
     private void increment(Map<Character.UnicodeScript, Long> counter, Character.UnicodeScript unicodeScript) {
         Long number = counter.get(unicodeScript);
-        if (number==null) {
-            counter.put(unicodeScript, 1L);
-        } else {
-            counter.put(unicodeScript, number+1);
-        }
+        counter.put(unicodeScript, number == null ? 1L : number+1);
     }
 
 }
diff --git a/src/main/java/com/optimaize/langdetect/text/TextObject.java b/src/main/java/com/optimaize/langdetect/text/TextObject.java
@@ -40,7 +40,6 @@ public class TextObject implements CharSequence, Appendable {
 
     private final int maxTextLength;
 
-
     /**
      * @param maxTextLength 0 for no limit
      */
-Original file line number
+Diff line change
@@ Expand Up / @@ -40,7 +40,6 @@ public class TextObject implements CharSequence, Appendable { @@
         private final int maxTextLength;
         /**
          * @param maxTextLength 0 for no limit
          */
@@ Expand Down @@