From 18f70dd17f89a45ded719a7ae1ecd1da3c4e8014 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Gruchet?= Date: Thu, 14 Jan 2016 07:39:29 +0100 Subject: [PATCH] Set of minor optimizations/refactoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sébastien Gruchet --- .../langdetect/LanguageDetectorBuilder.java | 3 ++ .../langdetect/LanguageDetectorImpl.java | 36 +++++-------- .../langdetect/NgramFrequencyData.java | 4 +- .../optimaize/langdetect/i18n/LdLocale.java | 3 +- .../ngram/BackwardsCompatibleNgramFilter.java | 21 ++------ .../langdetect/ngram/NgramExtractor.java | 4 +- .../langdetect/ngram/OldNgramExtractor.java | 6 +-- .../langdetect/ngram/StandardNgramFilter.java | 22 ++------ .../profiles/LanguageProfileImpl.java | 13 ++--- .../profiles/LanguageProfileReader.java | 20 ++++--- .../profiles/LanguageProfileWriter.java | 3 +- .../langdetect/text/MultiTextFilter.java | 6 +-- .../text/RemoveMinorityScriptsTextFilter.java | 6 +-- .../optimaize/langdetect/text/TextObject.java | 1 - .../langdetect/LanguageDetectorImplTest.java | 54 ++++++++++++------- 15 files changed, 88 insertions(+), 114 deletions(-) diff --git a/src/main/java/com/optimaize/langdetect/LanguageDetectorBuilder.java b/src/main/java/com/optimaize/langdetect/LanguageDetectorBuilder.java index b199801..80a9af1 100644 --- a/src/main/java/com/optimaize/langdetect/LanguageDetectorBuilder.java +++ b/src/main/java/com/optimaize/langdetect/LanguageDetectorBuilder.java @@ -84,6 +84,7 @@ public LanguageDetectorBuilder affixFactor(double affixFactor) { suffixFactor(affixFactor); return this; } + /** * To weight n-grams that are on the left border of a word differently from n-grams * in the middle of words, assign a value here. @@ -98,6 +99,7 @@ public LanguageDetectorBuilder prefixFactor(double prefixFactor) { this.prefixFactor = prefixFactor; return this; } + /** * Defaults to 1.0, which means don't use this feature. * @param suffixFactor 0.0 to 10.0, a suggested value is 2.0 @@ -154,6 +156,7 @@ public LanguageDetectorBuilder withProfile(LanguageProfile languageProfile) thro languageProfiles.add(languageProfile); return this; } + /** * @throws IllegalStateException if a profile for the same language was added already (must be a userland bug). */ diff --git a/src/main/java/com/optimaize/langdetect/LanguageDetectorImpl.java b/src/main/java/com/optimaize/langdetect/LanguageDetectorImpl.java index 012720a..f37e635 100644 --- a/src/main/java/com/optimaize/langdetect/LanguageDetectorImpl.java +++ b/src/main/java/com/optimaize/langdetect/LanguageDetectorImpl.java @@ -128,11 +128,7 @@ public Optional detect(CharSequence text) { @Override public List getProbabilities(CharSequence text) { double[] langprob = detectBlock(text); - if (langprob==null) { - return Collections.emptyList(); - } else { - return sortProbability(langprob); - } + return langprob==null ? Collections.emptyList() : sortProbability(langprob); } @@ -143,12 +139,10 @@ public List getProbabilities(CharSequence text) { private double[] detectBlock(CharSequence text) { if (text.length() <= shortTextAlgorithm) { Map ngrams = ngramExtractor.extractCountedGrams(text); - if (ngrams.isEmpty()) return null; - return detectBlockShortText(ngrams); + return ngrams.isEmpty() ? null : detectBlockShortText(ngrams); } else { List strings = ngramExtractor.extractGrams(text); - if (strings.isEmpty()) return null; - return detectBlockLongText(strings); + return strings.isEmpty() ? null : detectBlockLongText(strings); } } @@ -171,8 +165,11 @@ private double[] detectBlockShortText(Map ngrams) { */ private double[] detectBlockLongText(List ngrams) { assert !ngrams.isEmpty(); + final boolean traceEnabled = logger.isTraceEnabled(); + double[] langprob = new double[ngramFrequencyData.getLanguageList().size()]; Random rand = new Random(seed.or(DEFAULT_SEED)); + for (int t = 0; t < N_TRIAL; ++t) { double[] prob = initProbability(); double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH); @@ -182,11 +179,11 @@ private double[] detectBlockLongText(List ngrams) { updateLangProb(prob, ngrams.get(r), 1, alpha); if (i % 5 == 0) { if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this looks like an optimization to return quickly when sure. TODO document what's the plan. - if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob)); + if (traceEnabled) logger.trace("> " + sortProbability(prob)); } } for(int j=0;j " + sortProbability(prob)); + if (traceEnabled) logger.trace("==> " + sortProbability(prob)); } return langprob; } @@ -199,11 +196,11 @@ private double[] detectBlockLongText(List ngrams) { private double[] initProbability() { double[] prob = new double[ngramFrequencyData.getLanguageList().size()]; if (priorMap != null) { - //TODO analyze and optimize this code, looks like double copy. System.arraycopy(priorMap, 0, prob, 0, prob.length); - for(int i=0;i sortProbability(double[] prob) { - List list = new ArrayList<>(); + List list = new ArrayList<>(prob.length); for (int j=0;j= probabilityThreshold) { - for (int i=0; i<=list.size(); ++i) { - if (i == list.size() || list.get(i).getProbability() < p) { - list.add(i, new DetectedLanguage(ngramFrequencyData.getLanguage(j), p)); - break; - } - } + list.add(new DetectedLanguage(ngramFrequencyData.getLanguage(j), p)); } } + Collections.sort(list); return list; } - } diff --git a/src/main/java/com/optimaize/langdetect/NgramFrequencyData.java b/src/main/java/com/optimaize/langdetect/NgramFrequencyData.java index b09988d..ebf453a 100644 --- a/src/main/java/com/optimaize/langdetect/NgramFrequencyData.java +++ b/src/main/java/com/optimaize/langdetect/NgramFrequencyData.java @@ -46,8 +46,8 @@ public static NgramFrequencyData create(@NotNull Collection lan if (gramLengths.isEmpty()) throw new IllegalArgumentException("No gramLengths provided!"); Map wordLangProbMap = new HashMap<>(); - List langlist = new ArrayList<>(); - int langsize = languageProfiles.size(); + final int langsize = languageProfiles.size(); + List langlist = new ArrayList<>(langsize); int index = -1; for (LanguageProfile profile : languageProfiles) { diff --git a/src/main/java/com/optimaize/langdetect/i18n/LdLocale.java b/src/main/java/com/optimaize/langdetect/i18n/LdLocale.java index 672fbf8..be8a14f 100644 --- a/src/main/java/com/optimaize/langdetect/i18n/LdLocale.java +++ b/src/main/java/com/optimaize/langdetect/i18n/LdLocale.java @@ -49,6 +49,7 @@ */ public final class LdLocale { + public static final Splitter DASH_SPLITTER = Splitter.on('-'); @NotNull private final String language; @NotNull @@ -74,7 +75,7 @@ public static LdLocale fromString(@NotNull String string) { Optional script = null; Optional region = null; - List strings = Splitter.on('-').splitToList(string); + List strings = DASH_SPLITTER.splitToList(string); for (int i=0; i extractGrams(@NotNull CharSequence text) { return Collections.emptyList(); } List grams = new ArrayList<>(totalNumGrams); - + String textAsString = text.toString(); for (Integer gramLength : gramLengths) { int numGrams = len - (gramLength -1); if (numGrams >= 1) { //yes can be negative for (int pos=0; pos extractNGrams(@NotNull CharSequence text, @Nullable F ngram.addChar(text.charAt(i)); for(int n=1;n<=NGram.N_GRAM;++n){ String w = ngram.get(n); - if (w!=null) { //TODO this null check is ugly - if (filter==null || filter.use(w)) { - list.add(w); - } + if (w!=null && (filter==null || filter.use(w))) { + list.add(w); } } } diff --git a/src/main/java/com/optimaize/langdetect/ngram/StandardNgramFilter.java b/src/main/java/com/optimaize/langdetect/ngram/StandardNgramFilter.java index 9245f58..b236f82 100644 --- a/src/main/java/com/optimaize/langdetect/ngram/StandardNgramFilter.java +++ b/src/main/java/com/optimaize/langdetect/ngram/StandardNgramFilter.java @@ -13,35 +13,23 @@ public static NgramFilter getInstance() { return INSTANCE; } - private StandardNgramFilter() { - } + private StandardNgramFilter() {} @Override public boolean use(String ngram) { switch (ngram.length()) { case 1: - if (ngram.charAt(0)==' ') { - return false; - } - return true; + return ngram.charAt(0) != ' '; case 2: return true; case 3: - if (ngram.charAt(1)==' ') { - //middle char is a space - return false; - } - return true; + return ngram.charAt(1) != ' '; case 4: - if (ngram.charAt(1)==' ' || ngram.charAt(2)==' ') { - //one of the middle chars is a space - return false; - } - return true; + //one of the middle chars is a space + return ngram.charAt(1) !=' ' && ngram.charAt(2) !=' '; default: //would need the same check: no space in the middle, border is fine. throw new UnsupportedOperationException("Unsupported n-gram length: "+ngram.length()); } } - } diff --git a/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileImpl.java b/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileImpl.java index a4c494b..3e0de4c 100644 --- a/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileImpl.java +++ b/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileImpl.java @@ -65,9 +65,9 @@ public Stats(@NotNull Map numOccurrences, } private static Stats makeStats(Map> ngrams) { - Map numOccurrences = new HashMap<>(6); - Map minGramCounts = new HashMap<>(6); - Map maxGramCounts = new HashMap<>(6); + Map numOccurrences = new HashMap<>(ngrams.size()); + Map minGramCounts = new HashMap<>(ngrams.size()); + Map maxGramCounts = new HashMap<>(ngrams.size()); for (Map.Entry> entry : ngrams.entrySet()) { long count = 0; Long min = null; @@ -81,9 +81,10 @@ private static Stats makeStats(Map> ngrams) { max = (long)integer; } } - numOccurrences.put(entry.getKey(), count); - minGramCounts.put(entry.getKey(), min); - maxGramCounts.put(entry.getKey(), max); + final Integer key = entry.getKey(); + numOccurrences.put(key, count); + minGramCounts.put(key, min); + maxGramCounts.put(key, max); } return new Stats(numOccurrences, minGramCounts, maxGramCounts); } diff --git a/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileReader.java b/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileReader.java index 8314a33..b1953a1 100644 --- a/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileReader.java +++ b/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileReader.java @@ -97,7 +97,7 @@ private String makeProfileFileName(@NotNull LdLocale locale) { @NotNull public List readBuiltIn(@NotNull Collection languages) throws IOException { - List profileNames = new ArrayList<>(); + List profileNames = new ArrayList<>(languages.size()); for (LdLocale locale : languages) { profileNames.add(makeProfileFileName(locale)); } @@ -110,12 +110,14 @@ public List readBuiltIn(@NotNull Collection languages public List readAll() throws IOException { return readAllBuiltIn(); } + /** * Reads all built-in language profiles from the "languages" folder (shipped with the jar). */ public List readAllBuiltIn() throws IOException { - List loaded = new ArrayList<>(); - for (LdLocale locale : BuiltInLanguages.getLanguages()) { + final List languages = BuiltInLanguages.getLanguages(); + List loaded = new ArrayList<>(languages.size()); + for (LdLocale locale : languages) { loaded.add(readBuiltIn(locale)); } return loaded; @@ -148,20 +150,17 @@ public boolean accept(File pathname) { List profiles = new ArrayList<>(listFiles.length); for (File file: listFiles) { - if (!looksLikeLanguageProfileFile(file)) { - continue; + if (looksLikeLanguageProfileFile(file)) { + profiles.add(read(file)); } - profiles.add(read(file)); } return profiles; } private boolean looksLikeLanguageProfileFile(File file) { - if (!file.isFile()) { - return false; - } - return looksLikeLanguageProfileName(file.getName()); + return file.isFile() ? looksLikeLanguageProfileName(file.getName()) : false; } + private boolean looksLikeLanguageProfileName(String fileName) { if (fileName.contains(".")) { return false; @@ -173,5 +172,4 @@ private boolean looksLikeLanguageProfileName(String fileName) { return false; } } - } diff --git a/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileWriter.java b/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileWriter.java index 0c691b1..1425898 100644 --- a/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileWriter.java +++ b/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileWriter.java @@ -28,12 +28,13 @@ public void write(@NotNull LanguageProfile languageProfile, @NotNull OutputStrea for (Map.Entry entry : languageProfile.iterateGrams()) { if (!first) { writer.write(','); + } else { + first = false; } writer.write('"'); writer.write(entry.getKey()); writer.write("\":"); writer.write(entry.getValue().toString()); - first = false; } writer.write("},\"n_words\":["); first = true; diff --git a/src/main/java/com/optimaize/langdetect/text/MultiTextFilter.java b/src/main/java/com/optimaize/langdetect/text/MultiTextFilter.java index 36924e3..7826b40 100644 --- a/src/main/java/com/optimaize/langdetect/text/MultiTextFilter.java +++ b/src/main/java/com/optimaize/langdetect/text/MultiTextFilter.java @@ -20,11 +20,7 @@ public class MultiTextFilter implements TextFilter { * @param filters may be empty by definition */ public MultiTextFilter(@NotNull List filters) { - if (filters.isEmpty()) { - this.filters = null; - } else { - this.filters = ImmutableList.copyOf(filters); - } + this.filters = filters.isEmpty() ? null : ImmutableList.copyOf(filters); } @Override diff --git a/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java b/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java index 047ae2f..94a3d9a 100644 --- a/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java +++ b/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java @@ -112,11 +112,7 @@ private Map countByScript(CharSequence text) { } private void increment(Map counter, Character.UnicodeScript unicodeScript) { Long number = counter.get(unicodeScript); - if (number==null) { - counter.put(unicodeScript, 1L); - } else { - counter.put(unicodeScript, number+1); - } + counter.put(unicodeScript, number == null ? 1L : number+1); } } diff --git a/src/main/java/com/optimaize/langdetect/text/TextObject.java b/src/main/java/com/optimaize/langdetect/text/TextObject.java index 9e43a54..270818e 100644 --- a/src/main/java/com/optimaize/langdetect/text/TextObject.java +++ b/src/main/java/com/optimaize/langdetect/text/TextObject.java @@ -40,7 +40,6 @@ public class TextObject implements CharSequence, Appendable { private final int maxTextLength; - /** * @param maxTextLength 0 for no limit */ diff --git a/src/test/java/com/optimaize/langdetect/LanguageDetectorImplTest.java b/src/test/java/com/optimaize/langdetect/LanguageDetectorImplTest.java index ed39b01..f3871be 100644 --- a/src/test/java/com/optimaize/langdetect/LanguageDetectorImplTest.java +++ b/src/test/java/com/optimaize/langdetect/LanguageDetectorImplTest.java @@ -7,6 +7,7 @@ import com.optimaize.langdetect.profiles.LanguageProfile; import com.optimaize.langdetect.profiles.OldLangProfileConverter; import com.optimaize.langdetect.text.*; +import org.junit.Before; import org.junit.Test; import java.io.IOException; @@ -21,18 +22,50 @@ */ public class LanguageDetectorImplTest { + private LanguageDetector languageDetector; + + @Before + public void setUp() throws IOException { + LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard()); + builder.shortTextAlgorithm(50) + .prefixFactor(1.5) + .suffixFactor(2.0); + + LangProfileReader langProfileReader = new LangProfileReader(); + for (String language : ImmutableList.of("en", "fr", "nl", "de")) { + LangProfile langProfile = langProfileReader.read(LanguageDetectorImplTest.class.getResourceAsStream("/languages/" + language)); + LanguageProfile languageProfile = OldLangProfileConverter.convert(langProfile); + builder.withProfile(languageProfile); + } + languageDetector = builder.build(); + } + @Test public void german() throws IOException { - LanguageDetector languageDetector = makeNewDetector(); List result = languageDetector.getProbabilities("Dies ist eine deutsche Text"); DetectedLanguage best = result.get(0); assertEquals(best.getLocale().getLanguage(), "de"); assertTrue(best.getProbability() >= 0.9999d); } + @Test + public void french() throws IOException { + List result = languageDetector.getProbabilities("Ceci est un texte en français"); + DetectedLanguage best = result.get(0); + assertEquals(best.getLocale().getLanguage(), "fr"); + assertTrue(best.getProbability() >= 0.9999d); + } + + @Test + public void dutch() throws IOException { + List result = languageDetector.getProbabilities("Dit is wat tekst"); + DetectedLanguage best = result.get(0); + assertEquals(best.getLocale().getLanguage(), "nl"); + assertTrue(best.getProbability() >= 0.9999d); + } + @Test public void germanShort() throws IOException { - LanguageDetector languageDetector = makeNewDetector(); List result = languageDetector.getProbabilities("deutsche Text"); DetectedLanguage best = result.get(0); assertEquals(best.getLocale().getLanguage(), "de"); @@ -44,27 +77,10 @@ public void germanShortWithUrl() throws IOException { TextObjectFactory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); TextObject inputText = textObjectFactory.create().append("deutsche Text").append(" ").append("http://www.github.com/"); - LanguageDetector languageDetector = makeNewDetector(); List result = languageDetector.getProbabilities(inputText); DetectedLanguage best = result.get(0); assertEquals(best.getLocale().getLanguage(), "de"); assertTrue(best.getProbability() >= 0.9999d); } - private LanguageDetector makeNewDetector() throws IOException { - LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard()); - builder.shortTextAlgorithm(50); - builder.prefixFactor(1.5); - builder.suffixFactor(2.0); - - LangProfileReader langProfileReader = new LangProfileReader(); - for (String language : ImmutableList.of("en", "fr", "nl", "de")) { - LangProfile langProfile = langProfileReader.read(LanguageDetectorImplTest.class.getResourceAsStream("/languages/" + language)); - LanguageProfile languageProfile = OldLangProfileConverter.convert(langProfile); - builder.withProfile(languageProfile); - } - - return builder.build(); - } - }