Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ public LanguageDetectorBuilder affixFactor(double affixFactor) {
suffixFactor(affixFactor);
return this;
}

/**
* To weight n-grams that are on the left border of a word differently from n-grams
* in the middle of words, assign a value here.
Expand All @@ -98,6 +99,7 @@ public LanguageDetectorBuilder prefixFactor(double prefixFactor) {
this.prefixFactor = prefixFactor;
return this;
}

/**
* Defaults to 1.0, which means don't use this feature.
* @param suffixFactor 0.0 to 10.0, a suggested value is 2.0
Expand Down Expand Up @@ -154,6 +156,7 @@ public LanguageDetectorBuilder withProfile(LanguageProfile languageProfile) thro
languageProfiles.add(languageProfile);
return this;
}

/**
* @throws IllegalStateException if a profile for the same language was added already (must be a userland bug).
*/
Expand Down
36 changes: 14 additions & 22 deletions src/main/java/com/optimaize/langdetect/LanguageDetectorImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -128,11 +128,7 @@ public Optional<LdLocale> detect(CharSequence text) {
@Override
public List<DetectedLanguage> getProbabilities(CharSequence text) {
    // detectBlock() returns null when no n-grams could be extracted from the text.
    double[] langprob = detectBlock(text);
    if (langprob == null) {
        // Nothing to rank: report "no candidates" as an empty list rather than null.
        return Collections.emptyList();
    }
    // Turn the raw per-language probabilities into the list of detected languages.
    return sortProbability(langprob);
}


Expand All @@ -143,12 +139,10 @@ public List<DetectedLanguage> getProbabilities(CharSequence text) {
private double[] detectBlock(CharSequence text) {
    // Texts up to shortTextAlgorithm characters go through the counted-n-gram path.
    if (text.length() <= shortTextAlgorithm) {
        Map<String, Integer> countedGrams = ngramExtractor.extractCountedGrams(text);
        // null signals "no n-grams extracted" to the caller.
        return countedGrams.isEmpty() ? null : detectBlockShortText(countedGrams);
    }
    // Longer texts use the plain n-gram list.
    List<String> grams = ngramExtractor.extractGrams(text);
    return grams.isEmpty() ? null : detectBlockLongText(grams);
}

Expand All @@ -171,8 +165,11 @@ private double[] detectBlockShortText(Map<String, Integer> ngrams) {
*/
private double[] detectBlockLongText(List<String> ngrams) {
assert !ngrams.isEmpty();
final boolean traceEnabled = logger.isTraceEnabled();

double[] langprob = new double[ngramFrequencyData.getLanguageList().size()];
Random rand = new Random(seed.or(DEFAULT_SEED));

for (int t = 0; t < N_TRIAL; ++t) {
double[] prob = initProbability();
double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH);
Expand All @@ -182,11 +179,11 @@ private double[] detectBlockLongText(List<String> ngrams) {
updateLangProb(prob, ngrams.get(r), 1, alpha);
if (i % 5 == 0) {
if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this looks like an optimization to return quickly when sure. TODO document what's the plan.
if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob));
if (traceEnabled) logger.trace("> " + sortProbability(prob));
}
}
for(int j=0;j<langprob.length;++j) langprob[j] += prob[j] / N_TRIAL;
if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob));
if (traceEnabled) logger.trace("==> " + sortProbability(prob));
}
return langprob;
}
Expand All @@ -199,11 +196,11 @@ private double[] detectBlockLongText(List<String> ngrams) {
private double[] initProbability() {
double[] prob = new double[ngramFrequencyData.getLanguageList().size()];
if (priorMap != null) {
//TODO analyze and optimize this code, looks like double copy.
System.arraycopy(priorMap, 0, prob, 0, prob.length);
for(int i=0;i<prob.length;++i) prob[i] = priorMap[i];
} else {
for(int i=0;i<prob.length;++i) prob[i] = 1.0 / ngramFrequencyData.getLanguageList().size();
for(int i=0;i<prob.length;++i) {
prob[i] = 1.0 / ngramFrequencyData.getLanguageList().size();
}
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the removal of this line is intentional, you should update the Javadoc for this method too.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lines 203 and 204 do exactly the same job, i.e. copying the priorMap array into the prob array. I don't think there should be any update to the Javadoc for such a change.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On taking a fresh look at this, I think you're right and no update is needed.

return prob;
}
Expand Down Expand Up @@ -244,19 +241,14 @@ private boolean updateLangProb(@NotNull double[] prob, @NotNull String ngram, in
*/
@NotNull
private List<DetectedLanguage> sortProbability(double[] prob) {
    // At most one entry per language index, so prob.length is an upper bound for the size.
    List<DetectedLanguage> list = new ArrayList<>(prob.length);
    for (int j = 0; j < prob.length; ++j) {
        double p = prob[j];
        // Languages below the configured threshold are not reported at all.
        if (p < probabilityThreshold) {
            continue;
        }
        // Find the first entry with a strictly smaller probability and insert in front of it.
        // This keeps the list ordered from highest to lowest probability, keeps ties in
        // index order, and does not depend on DetectedLanguage's natural ordering.
        int insertAt = 0;
        while (insertAt < list.size() && list.get(insertAt).getProbability() >= p) {
            insertAt++;
        }
        // Each language is added exactly once.
        list.add(insertAt, new DetectedLanguage(ngramFrequencyData.getLanguage(j), p));
    }
    return list;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ public static NgramFrequencyData create(@NotNull Collection<LanguageProfile> lan
if (gramLengths.isEmpty()) throw new IllegalArgumentException("No gramLengths provided!");

Map<String, double[]> wordLangProbMap = new HashMap<>();
List<LdLocale> langlist = new ArrayList<>();
int langsize = languageProfiles.size();
final int langsize = languageProfiles.size();
List<LdLocale> langlist = new ArrayList<>(langsize);

int index = -1;
for (LanguageProfile profile : languageProfiles) {
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/com/optimaize/langdetect/i18n/LdLocale.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
*/
public final class LdLocale {

public static final Splitter DASH_SPLITTER = Splitter.on('-');
@NotNull
private final String language;
@NotNull
Expand All @@ -74,7 +75,7 @@ public static LdLocale fromString(@NotNull String string) {
Optional<String> script = null;
Optional<String> region = null;

List<String> strings = Splitter.on('-').splitToList(string);
List<String> strings = DASH_SPLITTER.splitToList(string);
for (int i=0; i<strings.size(); i++) {
String chunk = strings.get(i);
if (i==0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,11 @@ private BackwardsCompatibleNgramFilter() {
public boolean use(String ngram) {
switch (ngram.length()) {
case 1:
if (ngram.charAt(0)==' ') {
return false;
}
return true;
return ngram.charAt(0) != ' ';
case 2:
if (Character.isUpperCase(ngram.charAt(0)) && Character.isUpperCase(ngram.charAt(1))) {
//all upper case
return false;
}
return true;
return !ngram.equals(ngram.toUpperCase());
case 3:
if (Character.isUpperCase(ngram.charAt(0)) && Character.isUpperCase(ngram.charAt(1)) && Character.isUpperCase(ngram.charAt(2))) {
//all upper case
return false;
}
if (ngram.charAt(1)==' ') {
//middle char is a space
return false;
}
return true;
return ngram.charAt(1) !=' ' && !ngram.equals(ngram.toUpperCase());
default:
throw new UnsupportedOperationException("Unsupported n-gram length: "+ngram.length());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,12 @@ public List<String> extractGrams(@NotNull CharSequence text) {
return Collections.emptyList();
}
List<String> grams = new ArrayList<>(totalNumGrams);

String textAsString = text.toString();
for (Integer gramLength : gramLengths) {
int numGrams = len - (gramLength -1);
if (numGrams >= 1) { //yes can be negative
for (int pos=0; pos<numGrams; pos++) {
String gram = text.subSequence(pos, pos + gramLength).toString();
String gram = textAsString.substring(pos, pos + gramLength);
if (filter==null || filter.use(gram)) {
grams.add(gram);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,8 @@ public static List<String> extractNGrams(@NotNull CharSequence text, @Nullable F
ngram.addChar(text.charAt(i));
for(int n=1;n<=NGram.N_GRAM;++n){
String w = ngram.get(n);
if (w!=null) { //TODO this null check is ugly
if (filter==null || filter.use(w)) {
list.add(w);
}
if (w!=null && (filter==null || filter.use(w))) {
list.add(w);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,35 +13,23 @@ public static NgramFilter getInstance() {
return INSTANCE;
}

private StandardNgramFilter() {
}
private StandardNgramFilter() {}

@Override
public boolean use(String ngram) {
    // Decides whether an n-gram is kept, based only on where spaces occur in it.
    // Spaces on the border of an n-gram are fine; a space in an inner position is not.
    switch (ngram.length()) {
        case 1:
            // A lone space carries no information.
            return ngram.charAt(0) != ' ';
        case 2:
            // Both characters are border characters, so every 2-gram is accepted.
            return true;
        case 3:
            // Reject when the middle char is a space.
            return ngram.charAt(1) != ' ';
        case 4:
            // Reject when one of the two middle chars is a space.
            return ngram.charAt(1) != ' ' && ngram.charAt(2) != ' ';
        default:
            // Would need the same check: no space in the middle, border is fine.
            throw new UnsupportedOperationException("Unsupported n-gram length: "+ngram.length());
    }
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ public Stats(@NotNull Map<Integer, Long> numOccurrences,
}

private static Stats makeStats(Map<Integer, Map<String, Integer>> ngrams) {
Map<Integer, Long> numOccurrences = new HashMap<>(6);
Map<Integer, Long> minGramCounts = new HashMap<>(6);
Map<Integer, Long> maxGramCounts = new HashMap<>(6);
Map<Integer, Long> numOccurrences = new HashMap<>(ngrams.size());
Map<Integer, Long> minGramCounts = new HashMap<>(ngrams.size());
Map<Integer, Long> maxGramCounts = new HashMap<>(ngrams.size());
for (Map.Entry<Integer, Map<String, Integer>> entry : ngrams.entrySet()) {
long count = 0;
Long min = null;
Expand All @@ -81,9 +81,10 @@ private static Stats makeStats(Map<Integer, Map<String, Integer>> ngrams) {
max = (long)integer;
}
}
numOccurrences.put(entry.getKey(), count);
minGramCounts.put(entry.getKey(), min);
maxGramCounts.put(entry.getKey(), max);
final Integer key = entry.getKey();
numOccurrences.put(key, count);
minGramCounts.put(key, min);
maxGramCounts.put(key, max);
}
return new Stats(numOccurrences, minGramCounts, maxGramCounts);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ private String makeProfileFileName(@NotNull LdLocale locale) {

@NotNull
public List<LanguageProfile> readBuiltIn(@NotNull Collection<LdLocale> languages) throws IOException {
List<String> profileNames = new ArrayList<>();
List<String> profileNames = new ArrayList<>(languages.size());
for (LdLocale locale : languages) {
profileNames.add(makeProfileFileName(locale));
}
Expand All @@ -110,12 +110,14 @@ public List<LanguageProfile> readBuiltIn(@NotNull Collection<LdLocale> languages
public List<LanguageProfile> readAll() throws IOException {
    // Pure alias: forwards to readAllBuiltIn() and hands back its result unchanged.
    final List<LanguageProfile> builtInProfiles = readAllBuiltIn();
    return builtInProfiles;
}

/**
* Reads all built-in language profiles from the "languages" folder (shipped with the jar).
*/
public List<LanguageProfile> readAllBuiltIn() throws IOException {
List<LanguageProfile> loaded = new ArrayList<>();
for (LdLocale locale : BuiltInLanguages.getLanguages()) {
final List<LdLocale> languages = BuiltInLanguages.getLanguages();
List<LanguageProfile> loaded = new ArrayList<>(languages.size());
for (LdLocale locale : languages) {
loaded.add(readBuiltIn(locale));
}
return loaded;
Expand Down Expand Up @@ -148,20 +150,17 @@ public boolean accept(File pathname) {

List<LanguageProfile> profiles = new ArrayList<>(listFiles.length);
for (File file: listFiles) {
if (!looksLikeLanguageProfileFile(file)) {
continue;
if (looksLikeLanguageProfileFile(file)) {
profiles.add(read(file));
}
profiles.add(read(file));
}
return profiles;
}

private boolean looksLikeLanguageProfileFile(File file) {
    // Only regular files qualify; the name check is skipped for anything else
    // (short-circuit &&), exactly like the previous early return.
    return file.isFile() && looksLikeLanguageProfileName(file.getName());
}

private boolean looksLikeLanguageProfileName(String fileName) {
if (fileName.contains(".")) {
return false;
Expand All @@ -173,5 +172,4 @@ private boolean looksLikeLanguageProfileName(String fileName) {
return false;
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,13 @@ public void write(@NotNull LanguageProfile languageProfile, @NotNull OutputStrea
for (Map.Entry<String, Integer> entry : languageProfile.iterateGrams()) {
if (!first) {
writer.write(',');
} else {
first = false;
}
writer.write('"');
writer.write(entry.getKey());
writer.write("\":");
writer.write(entry.getValue().toString());
first = false;
}
writer.write("},\"n_words\":[");
first = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,7 @@ public class MultiTextFilter implements TextFilter {
* @param filters may be empty by definition
*/
public MultiTextFilter(@NotNull List<TextFilter> filters) {
    // An empty filter list is stored as null; otherwise an immutable snapshot is kept,
    // so later changes to the caller's list do not affect this instance.
    // The field is assigned exactly once.
    this.filters = filters.isEmpty() ? null : ImmutableList.copyOf(filters);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,7 @@ private Map<Character.UnicodeScript, Long> countByScript(CharSequence text) {
}
private void increment(Map<Character.UnicodeScript, Long> counter, Character.UnicodeScript unicodeScript) {
    // Adds one to the tally for the given script; a script seen for the first time starts at 1.
    // The map is written exactly once per call.
    Long number = counter.get(unicodeScript);
    counter.put(unicodeScript, number == null ? 1L : number + 1);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ public class TextObject implements CharSequence, Appendable {

private final int maxTextLength;


/**
* @param maxTextLength 0 for no limit
*/
Expand Down
Loading