Start line:  
End line:  

Snippet Preview

Snippet HTML Code

Stack Overflow Questions
  package org.apache.solr.update.processor;
  
  /*
   * Licensed to the Apache Software Foundation (ASF) under one or more
   * contributor license agreements.  See the NOTICE file distributed with
   * this work for additional information regarding copyright ownership.
   * The ASF licenses this file to You under the Apache License, Version 2.0
   * (the "License"); you may not use this file except in compliance with
   * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 
 import java.util.List;
/**
 * Identifies the language of a set of input fields. Also supports mapping of field names based
 * on the detected language.
 * <p>
 * See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
 *
 * @since 3.5
 * @lucene.experimental
 */
 
 public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestProcessor implements LangIdParams {
 
   protected final static Logger log = LoggerFactory
           .getLogger(LanguageIdentifierUpdateProcessor.class);
 
   protected boolean enabled;
 
   protected String[] inputFields = {};
   protected String[] mapFields = {};
   protected Pattern mapPattern;
   protected String mapReplaceStr;
   protected String langField;
   protected String langsField// MultiValued, contains all languages detected
   protected String docIdField;
   protected String fallbackValue;
   protected String[] fallbackFields = {};
   protected boolean enableMapping;
   protected boolean mapKeepOrig;
   protected boolean overwrite;
   protected boolean mapOverwrite;
   protected boolean mapIndividual;
   protected boolean enforceSchema;
   protected double threshold;
   protected HashSet<StringlangWhitelist;
   protected HashSet<StringallMapFieldsSet;
   protected HashMap<String,StringlcMap;
   protected HashMap<String,StringmapLcMap;
   protected IndexSchema schema;
 
   // Regex patterns
   protected final Pattern tikaSimilarityPattern = Pattern.compile(".*\\((.*?)\\)");
   protected final Pattern langPattern = Pattern.compile("\\{lang\\}");
 
                                            SolrQueryResponse rspUpdateRequestProcessor next) {
     super(next);
      = req.getSchema();
 
     initParams(req.getParams());
   }
 
   /*
    * Reads the processor configuration from the given request parameters.
    *
    * NOTE(review): this listing lost tokens during extraction. The assignment
    * targets (left-hand sides), the parameter-name arguments of params.get() /
    * params.getBool(), several default values and the logger receiver are
    * missing, so the method does not compile as shown. The missing names are
    * constants and defaults defined outside this file view; restore them from
    * the upstream source rather than guessing.
    */
   private void initParams(SolrParams params) {
     // A null params object skips all parsing and goes straight to the final check.
     if (params != null) {
       // Document-centric langId params
       setEnabled(params.getBool(true));
       // List-valued settings are only split on "," when the value is non-empty.
       if(params.get("").length() > 0) {
          = params.get("").split(",");
       }
      // NOTE(review): the receiver of getUniqueKeyField() is missing -- presumably the schema; confirm.
      SchemaField uniqueKeyField = .getUniqueKeyField();
      // Uses the unique key field's name when the schema defines one; the alternative value is missing.
       = params.get(uniqueKeyField == null ?  : uniqueKeyField.getName());
       = params.get();
      if(params.get("").length() > 0) {
         = params.get().split(",");
      }
       = params.getBool(false);
       = new HashSet<String>();
      // Every comma-separated entry is added to a set (target missing -- presumably the whitelist; confirm).
      if(params.get("").length() > 0) {
        for(String lang : params.get("").split(",")) {
          .add(lang);
        }
      }
      // Mapping params (field centric)
       = params.getBool(false);
      if(params.get("").length() > 0) {
         = params.get("").split(",");
      } else {
         = ;
      }
       = params.getBool(false);
       = params.getBool(false);
       = params.getBool(false);
      // Process individual fields
      String[] mapIndividualFields = {};
      if(params.get("").length() > 0) {
        mapIndividualFields = params.get("").split(",");
      } else {
        mapIndividualFields = ;
      }
       = new HashSet<String>(Arrays.asList(mapIndividualFields));
      // Compile a union of the lists of fields to map
       = new HashSet<String>(Arrays.asList());
      // NOTE(review): Arrays.equals has lost an argument and the if-body is empty here;
      // a statement appears to have been dropped from this listing.
      if(Arrays.equals(mapIndividualFields)) {
      }
      // Normalize detected langcode onto normalized langcode
       = new HashMap<String,String>();
      if(params.get() != null) {
        // Entries are separated by a comma or a space; each entry must be "key:value".
        for(String mapping : params.get().split("[, ]")) {
          String[] keyVal = mapping.split(":");
          if(keyVal.length == 2) {
            .put(keyVal[0], keyVal[1]);
          } else {
            // A malformed entry is reported and skipped; parsing continues.
            .error("Unsupported format for langid.lcmap: "+mapping+". Skipping this mapping.");
          }
        }
      }
      // Language Code mapping
       = new HashMap<String,String>();
      if(params.get() != null) {
        // Same "key:value" format and separators as the map above.
        for(String mapping : params.get().split("[, ]")) {
          String[] keyVal = mapping.split(":");
          if(keyVal.length == 2) {
            .put(keyVal[0], keyVal[1]);
          } else {
            .error("Unsupported format for langid.map.lcmap: "+mapping+". Skipping this mapping.");
          }
        }
      }
       = params.getBool(true);
       = Pattern.compile(params.get());
    }
    .debug("LangId configured");
    // Fails when a length is zero; per the message this is the list of input fields
    // (the operand itself is missing from this listing).
    if (. == 0) {
      throw new SolrException(.,
              "Missing or faulty configuration of LanguageIdentifierUpdateProcessor. Input fields must be specified as a comma separated list");
    }
  }
  public void processAdd(AddUpdateCommand cmdthrows IOException {
    if (isEnabled()) {
      process(cmd.getSolrInputDocument());
    } else {
      .debug("Processor not enabled, not running");
    }
    super.processAdd(cmd);
  }

  
  /**
   * This is the main, testable process method called from processAdd().
   *
   * @param doc the SolrInputDocument to work on
   * @return the modified SolrInputDocument
   */
    // NOTE(review): the method signature that opens this body is missing from the listing.
    // processAdd() invokes it as process(cmd.getSolrInputDocument()) and the body returns doc.
    // Field names, flags, the logger receiver and argument separators were also lost below,
    // so the body does not compile as shown; restore from the upstream source.
    String docLang = null;
    // Collects every language resolved for this document (document-level and per-field).
    HashSet<StringdocLangs = new HashSet<String>();
    String fallbackLang = getFallbackLang(doc);
    // Detect the document language unless an existing value must be kept.
    // NOTE(review): the operands are missing; the comment in the else-branch suggests the
    // language field and an overwrite flag -- confirm.
    if( == null || !doc.containsKey() || (doc.containsKey() && )) {
      // Detection runs on the concatenated text; the result is resolved against the fallback.
      String allText = concatFields(doc);
      List<DetectedLanguagelanguagelist = detectLanguage(allText);
      docLang = resolveLanguage(languagelistfallbackLang);
      docLangs.add(docLang);
      .debug("Detected main document language from fields "++": "+docLang);
      if(doc.containsKey() && ) {
        .debug("Overwritten old value "+doc.getFieldValue());
      }
      if( != null && .length() != 0) {
        doc.setField(docLang);
      }
    } else {
      // langField is set, we sanity check it against whitelist and fallback
      docLang = resolveLanguage((Stringdoc.getFieldValue(), fallbackLang);
      docLangs.add(docLang);
      .debug("Field "++" already contained value "+docLang+", not overwriting.");
    }
    // NOTE(review): the condition and the collection being iterated are missing --
    // presumably the mapping switch and the set of fields to map; confirm.
    if() {
      for (String fieldName : ) {
        if(doc.containsKey(fieldName)) {
          String fieldLang;
          // Either detect the language of this one field (falling back to the document
          // language) or reuse the document language directly.
          if( && .contains(fieldName)) {
            String text = (Stringdoc.getFieldValue(fieldName);
            List<DetectedLanguagelanguagelist = detectLanguage(text);
            fieldLang = resolveLanguage(languagelistdocLang);
            docLangs.add(fieldLang);
            .debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
          } else {
            fieldLang = docLang;
            .debug("Mapping field "+fieldName+" using document global language "+fieldLang);
          }
          String mappedOutputField = getMappedField(fieldNamefieldLang);
          if (mappedOutputField != null) {
            .debug("Mapping field {} to {}"doc.getFieldValue(), fieldLang);
            // Copy value and boost of the source field onto the mapped field name.
            SolrInputField inField = doc.getField(fieldName);
            doc.setField(mappedOutputFieldinField.getValue(), inField.getBoost());
            // NOTE(review): negated operand missing -- presumably the keep-original flag; confirm.
            if(!) {
              .debug("Removing old field {}"fieldName);
              doc.removeField(fieldName);
            }
          } else {
            // getMappedField returned null: the mapping is rejected with an exception.
            throw new SolrException(.."Invalid output field mapping for "
                    + fieldName + " field and language: " + fieldLang);
          }
        }
      }
    }
    // Set the languages field to an array of all detected languages
    if( != null && .length() != 0) {
      doc.setField(docLangs.toArray());
    }
    return doc;
  }

  
  /**
   * Decides the fallback language, either from the content of a fallback field or from the
   * fallback value.
   *
   * @param doc the Solr document
   * @param fallbackFields an array of strings with field names containing fallback language codes
   * @param fallbackValue a language code to use in case no fallbackFields are found
   */
  private String getFallbackLang(SolrInputDocument docString[] fallbackFieldsString fallbackValue) {
    String lang = null;
    for(String field : fallbackFields) {
      if(doc.containsKey(field)) {
        lang = (Stringdoc.getFieldValue(field);
        .debug("Language fallback to field "+field);
        break;
      }
    }
    if(lang == null) {
      .debug("Language fallback to value "+fallbackValue);
      lang = fallbackValue;
    }
    return lang;
  }
  /*
   * Concatenates content from multiple fields
   */
  protected String concatFields(SolrInputDocument docString[] fields) {
    StringBuffer sb = new StringBuffer();
    for (String fieldName : ) {
      .debug("Appending field "+fieldName);
      if (doc.containsKey(fieldName)) {
        Object content = doc.getFieldValue(fieldName);
        if(content instanceof String) {
          sb.append((Stringdoc.getFieldValue(fieldName));
          sb.append(" ");
        } else {
          .warn("Field "+fieldName+" not a String value, not including in detection");
        }
      }
    }
    return sb.toString();
  }

  
  /**
   * Detects language(s) from a string. Classes wishing to implement their own language
   * detection module should override this method.
   *
   * @param content The content to identify
   * @return List of detected language(s) according to RFC-3066
   */
  protected abstract List<DetectedLanguagedetectLanguage(String content);

  
  /**
   * Chooses a language based on a single candidate language code.
   *
   * @param language language code as a string
   * @param fallbackLang the language code to use as a fallback
   * @return a string of the chosen language
   */
  protected String resolveLanguage(String languageString fallbackLang) {
    l.add(new DetectedLanguage(language, 1.0));
    return resolveLanguage(lfallbackLang);
  }

  
  /**
   * Chooses a language based on the list of candidates detected.
   *
   * @param languages a List of DetectedLanguages with certainty score
   * @param fallbackLang the language code to use as a fallback
   * @return a string of the chosen language
   */
  protected String resolveLanguage(List<DetectedLanguagelanguagesString fallbackLang) {
    String langStr;
    if(languages.size() == 0) {
      .debug("No language detected, using fallback {}"fallbackLang);
      langStr = fallbackLang;
    } else {
      DetectedLanguage lang = languages.get(0);
      String normalizedLang = normalizeLangCode(lang.getLangCode());
      if(.isEmpty() || .contains(normalizedLang)) {
        .debug("Language detected {} with certainty {}"normalizedLanglang.getCertainty());
        if(lang.getCertainty() >= ) {
          langStr = normalizedLang;
        } else {
          .debug("Detected language below threshold {}, using fallback {}"fallbackLang);
          langStr = fallbackLang;
        }
      } else {
        .debug("Detected a language not in whitelist ({}), using fallback {}"lang.getLangCode(), fallbackLang);
        langStr = fallbackLang;
      }
    }
    if(langStr == null || langStr.length() == 0) {
      .warn("Language resolved to null or empty string. Fallback not configured?");
      langStr = "";
    }
    return langStr;
  }

  
  /**
   * Looks up the language code in the map (langid.lcmap) and returns the mapped value.
   *
   * @param langCode the language code string returned from the detector
   * @return the normalized/mapped language code
   */
  protected String normalizeLangCode(String langCode) {
    if (.containsKey(langCode)) {
      String lc = .get(langCode);
      .debug("Doing langcode normalization mapping from "+langCode+" to "+lc);
      return lc;
    }
    return langCode;
  }

  
  /**
   * Returns the name of the field to map the current contents into, so that they are properly
   * analyzed. For instance, if the currentField is "text" and the code is "en", the new field
   * would by default be "text_en". This method also performs a custom regex pattern replace if
   * configured. If enforceSchema=true and the resulting field name doesn't exist, then null is
   * returned.
   *
   * @param currentField The current field name
   * @param language the language code
   * @return The new schema field name, based on pattern and replace, or null if illegal
   */
  protected String getMappedField(String currentFieldString language) {
    String lc = .containsKey(language) ? .get(language) : language;
    String newFieldName = .matcher(.matcher(currentField).replaceFirst()).replaceFirst(lc);
    if( && .getFieldOrNull(newFieldName) == null) {
      .warn("Unsuccessful field name mapping from {} to {}, field does not exist and enforceSchema=true; skipping mapping."currentFieldnewFieldName);
      return null;
    } else {
      .debug("Doing mapping from "+currentField+" with language "+language+" to field "+newFieldName);
    }
    return newFieldName;
  }

  
  /**
   * Tells if this processor is enabled or not.
   *
   * @return true if enabled, else false
   */
  public boolean isEnabled() {
    return ;
  }
  public void setEnabled(boolean enabled) {
    this. = enabled;
  }
New to GrepCode? Check out our FAQ X