Start line:  
End line:  

Snippet Preview

Snippet HTML Code

Stack Overflow Questions
/*
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 
 package de.tudarmstadt.ukp.dkpro.core.berkeleyparser;
 
import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.util.Level.INFO;
import static org.uimafit.util.JCasUtil.select;
import static org.uimafit.util.JCasUtil.selectCovered;
import static org.uimafit.util.JCasUtil.toText;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.commons.lang.mutable.MutableInt;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.uimafit.component.JCasAnnotator_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.util.FSCollectionFactory;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
Berkeley Parser annotator. Requires Sentences to be annotated before.

Author(s):
Richard Eckart de Castilho
 
 public class BerkeleyParser
 	extends JCasAnnotator_ImplBase
 {
 	private static final String CONPACKAGE = Constituent.class.getPackage().getName() + ".";
 
 	public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
 	@ConfigurationParameter(name = , mandatory = false)
 	protected String language;
 
 	public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
 	@ConfigurationParameter(name = , mandatory = false)
 	protected String variant;
 
 	public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
 	@ConfigurationParameter(name = , mandatory = false)
 	protected String modelLocation;
 
 	public static final String PARAM_TAGGER_MAPPING_LOCATION = ComponentParameters.PARAM_TAGGER_MAPPING_LOCATION;
 	@ConfigurationParameter(name = , mandatory = false)
 	protected String mappingLocation;
 
 	public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS;
 	@ConfigurationParameter(name = , mandatory = false, defaultValue = "true")
 	private boolean internTags;
 
 	public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
 	@ConfigurationParameter(name = , mandatory = true, defaultValue = "false")
 	protected boolean printTagSet;

Sets whether to create or not to create POS tags. The creation of constituent tags must be turned on for this to work.
Default: true
	public static final String PARAM_CREATE_POS_TAGS = "createPosTags";
	@ConfigurationParameter(name = , mandatory = true, defaultValue = "true")
	private boolean createPosTags;

If this parameter is set to true, each sentence is annotated with a PennTree-Annotation, containing the whole parse tree in Prenn Treebank style format.
Default: false
	public static final String PARAM_CREATE_PENN_TREE_STRING = "createPennTreeString";
	@ConfigurationParameter(name = , mandatory = true,
			defaultValue = "false")
	private boolean createPennTreeString;
	@Option(name = "-viterbi", usage = "Compute viterbi derivation instead of max-rule tree (Default: max-rule)")
	public boolean viterbi;
    @Option(name = "-substates", usage = "Output subcategories (only for binarized viterbi trees). (Default: false)")
	public boolean substates;
	@Option(name = "-scores", usage = "Output inside scores (only for binarized viterbi trees). (Default: false)")
	public boolean scores;
	@Option(name = "-accurate", usage = "Set thresholds for accuracy. (Default: set thresholds for efficiency)")
	public boolean accurate;
	@Option(name = "-variational", usage = "Use variational rule score approximation instead of max-rule (Default: false)")
	public boolean variational;
    @Option(name = "-keepFunctionLabels", usage = "Retain predicted function labels. Model must have been trained with function labels. (Default: false)")
    public boolean keepFunctionLabels;
    
	@Option(name = "-binarize", usage = "Output binarized trees. (Default: false)")
	public boolean binarize;
	private CasConfigurableProviderBase<ParsermodelProvider;
	private MappingProvider mappingProvider;
	public void initialize(UimaContext aContext)
		throws ResourceInitializationException
	{
		super.initialize(aContext);
		 = new CasConfigurableProviderBase<Parser>()
		{
			{
				setDefault(VERSION, "20090917.0");
				setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core");
				setDefault(ARTIFACT_ID,
						"de.tudarmstadt.ukp.dkpro.core.berkeleyparser-model-parser-${language}-${variant}");
						"de/tudarmstadt/ukp/dkpro/core/berkeleyparser/lib/parser-default-variants.map");
				setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/lib/"
"parser-${language}-${variant}.bin");
				setOverride(LOCATION, );
				setOverride(LANGUAGE, );
				setOverride(VARIANT, );
			}
			protected Parser produceResource(URL aUrl)
			{
				ObjectInputStream is = null;
				try {
					is = new ObjectInputStream(new GZIPInputStream(aUrl.openStream()));
					ParserData pData = (ParserDatais.readObject();
					Grammar grammar = pData.getGrammar();
					Lexicon lexicon = pData.getLexicon();
					Numberer.setNumberers(pData.getNumbs());
					double threshold = 1.0;
					if () {
						List<StringposTags = new ArrayList<String>();
						List<StringconstTags = new ArrayList<String>();
						Numberer tagNumberer = (NumbererpData.getNumbs().get("tags");
						for (int i = 0; i < tagNumberer.size(); i++) {
							String tag = (StringtagNumberer.object(i);
							if (! && tag.startsWith("@")) {
								continue// Only show aux. binarization tags if it is enabled.
							}
							if (tag.endsWith("^g")) {
								constTags.add(tag.substring(0, tag.length()-2));
							}
							else {
								posTags.add(tag);
							}
						}
						printTags("tagger"posTags);
						printTags("parser"constTags);
					}
					return new CoarseToFineMaxRuleParser(grammarlexiconthreshold, -1,
				}
					throw new IOException(e);
				}
				finally {
				}
			}
		};
		 = new MappingProvider();
		.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/"
"core/api/lexmorph/tagset/${language}-${tagger.tagset}-tagger.map");
		.setDefault(MappingProvider.BASE_TYPE, POS.class.getName());
		.setDefault("tagger.tagset""default");
		.setOverride(MappingProvider.LOCATION, );
		.setOverride(MappingProvider.LANGUAGE, );
		.addImport("tagger.tagset");
	}
	private void printTags(String aTypeList<StringaTags)
	{
		Collections.sort(aTags);
		sb.append("Model of " + aType + " contains [").append(aTags.size()).append("] tags: ");
		for (String tag : aTags) {
			sb.append(tag);
			sb.append(" ");
		}
		getContext().getLogger().log(INFO, sb.toString());
	}
	public void process(JCas aJCas)
		throws AnalysisEngineProcessException
	{
		CAS cas = aJCas.getCas();
		.configure(cas);
		.configure(cas);
		for (Sentence sentence : select(aJCas, Sentence.class)) {
			List<Token> tokens = selectCovered(aJCas, Token.classsentence);
			List<StringtokenText = toText(tokens);
			Tree<StringparseOutput = .getResource().getBestParse(tokenText);
			if (!) {
				parseOutput = TreeAnnotations.unAnnotateTree(parseOutput);
			}
			createConstituentAnnotationFromTree(aJCasparseOutputnulltokensnew MutableInt(0));
				PennTree pTree = new PennTree(aJCassentence.getBegin(), sentence.getEnd());
				pTree.setPennTree(parseOutput.toString());
				pTree.addToIndexes();
			}
		}
	}

Creates linked constituent annotations + POS annotations

Parameters:
aNode the source tree
aParentFS
aCreatePos sets whether to create or not to create POS tags
aCreateLemmas sets whether to create or not to create Lemmas
Returns:
the child-structure (needed for recursive call only)
	private Annotation createConstituentAnnotationFromTree(JCas aJCasTree<StringaNode,
			Annotation aParentFSList<Token> aTokens, MutableInt aIndex)
	{
		// If the node is a word-level constituent node (== POS):
		// create parent link on token and (if not turned off) create POS tag
		if (aNode.isPreTerminal()) {
			Token token = aTokens.get(aIndex.intValue());
			// link token to its parent constituent
			if (aParentFS != null) {
				token.setParent(aParentFS);
			}
			// only add POS to index if we want POS-tagging
				String typeName = aNode.getLabel();
				Type posTag = .getTagType(typeName);
				POS posAnno = (POS) aJCas.getCas().createAnnotation(posTagtoken.getBegin(),
						token.getEnd());
				posAnno.setPosValue( ? typeName.intern() : typeName);
				posAnno.addToIndexes();
				token.setPos((POS) posAnno);
			}
			aIndex.add(1);
			return token;
		}
		// Check if node is a constituent node on sentence or phrase-level
		else {
			String typeName = aNode.getLabel();
			// create the necessary objects and methods
			String constituentTypeName =  + typeName;
			Type type = aJCas.getTypeSystem().getType(constituentTypeName);
			// if type is unknown, map to X-type
			if (type == null) {
				type = aJCas.getTypeSystem().getType( + "X");
			}
			Constituent constAnno = (Constituent) aJCas.getCas().createAnnotation(type, 0, 0);
			constAnno.setConstituentType(typeName);
			// link to parent
			if (aParentFS != null) {
				constAnno.setParent(aParentFS);
			}
			// Do we have any children?
			List<Annotation> childAnnotations = new ArrayList<Annotation>();
			for (Tree<Stringchild : aNode.getChildren()) {
				Annotation childAnnotation = createConstituentAnnotationFromTree(aJCaschild,
						constAnnoaTokensaIndex);
				if (childAnnotation != null) {
					childAnnotations.add(childAnnotation);
				}
			}
			constAnno.setBegin(childAnnotations.get(0).getBegin());
			constAnno.setEnd(childAnnotations.get(childAnnotations.size()-1).getEnd());
			// Now that we know how many children we have, link annotation of
			// current node with its children
			FSArray childArray = (FSArray) FSCollectionFactory.createFSArray(aJCas,
					childAnnotations);
			constAnno.setChildren(childArray);
			// write annotation for current node to index
			aJCas.addFsToIndexes(constAnno);
			return constAnno;
		}
	}

Given a list of tokens (e.g. those from a sentence) return the one at the specified position.
	private Token getToken(List<Token> aTokensint aBeginint aEnd)
	{
		for (Token t : aTokens) {
			if (aBegin == t.getBegin() && aEnd == t.getEnd()) {
				return t;
			}
		}
		throw new IllegalStateException("Token not found");
	}
New to GrepCode? Check out our FAQ X