Start line:  
End line:  

Snippet Preview

Snippet HTML Code

Stack Overflow Questions
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to you under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 
 package org.apache.ctakes.temporal.ae.feature.selection;
 
 import java.io.File;
 import java.net.URI;
 import java.util.Set;
 
 import  org.cleartk.classifier.Feature;
 import  org.cleartk.classifier.Instance;
 import  org.cleartk.classifier.feature.transform.TransformableFeature;
 
/**
 * Selects features via Chi-squared statistics between the features extracted from its
 * sub-extractor and the outcome values they are paired with in classification instances.
 *
 * @author Chen Lin
 */
 
 public class Chi2FeatureSelection<OUTCOME_T> extends FeatureSelection<OUTCOME_T> {

	/** Helper class for aggregating counts and computing Chi-squared statistics. */
 
 	private static class Chi2Scorer<OUTCOME_T> implements Function<StringDouble> {
 		protected Multiset<OUTCOME_T> classCounts;
 
 		protected Table<String, OUTCOME_T, IntegerfeatValueClassCount;
 
 		private boolean yates = false;
 
 		public Chi2Scorer(boolean yate) {
 			this. = HashMultiset.<OUTCOME_T> create();
 			this. = HashBasedTable.<String, OUTCOME_T, Integercreate();
 			this. = yate;
 		}
 
 		public void update(String featureName, OUTCOME_T outcomeint occurrences) {
 			Integer count = this..get(featureNameoutcome);
 			if (count == null) {
 				count = 0;
 			}
 			this..put(featureNameoutcomecount + occurrences);
 			this..add(outcomeoccurrences);
 		}
 
 		public Double apply(String featureName) {
 			return this.score(featureName);
 		}
 
 		public double score(String featureName) {
 			// notation index of 0 means false, 1 mean true
 			// Contingency Table:
 			//      | class1  | class2  | class3  | sum
 			// posi |         |         |         | posiFeatCount
 			// nega |         |         |         | negaFeatCount
 			//      | outcnt1 | outcnt2 | outcnt3 | n
 
 			int numOfClass = this..elementSet().size();
 			int[] posiOutcomeCounts = new int[numOfClass];
 			int[] outcomeCounts = new int[numOfClass];
 			int classId = 0;
 			int posiFeatCount = 0;
 			for (OUTCOME_T clas : this..elementSet()) {
 				posiOutcomeCounts[classId] = this..contains(featureNameclas)
 						? this..get(featureNameclas)
 								: 0;
 						posiFeatCount += posiOutcomeCounts[classId];
						outcomeCounts[classId] = this..count(clas);
						classId++;
			}
			int n = this..size();
			int negaFeatCount = n - posiFeatCount;
			double chi2val = 0.0;
			if (posiFeatCount == 0 || posiFeatCount == n) { // all instances have same value on this
				// feature, degree of freedom = 0
				return chi2val;
			}
			for (int lbl = 0; lbl < numOfClasslbl++) {
				// for positive part of feature:
				double expected = (outcomeCounts[lbl] / (doublen) * (posiFeatCount);
				if (expected > 0) {
					double diff = Math.abs(posiOutcomeCounts[lbl] - expected);
					if (this. ) { // apply Yate's correction
						diff -= 0.5;
					}
					if (diff > 0)
						chi2val += Math.pow(diff, 2) / expected;
				}
				// for negative part of feature:
				expected = (outcomeCounts[lbl] / (doublen) * (negaFeatCount);
				double observ = outcomeCounts[lbl] - posiOutcomeCounts[lbl];
				if (expected > 0) {
					double diff = Math.abs(observ - expected);
					if (this.) { // apply Yate's correction
						diff -= 0.5;
					}
					if (diff > 0)
						chi2val += Math.pow(diff, 2) / expected;
				}
			}
			return chi2val;
		}
	}

	/** The fraction of all features to keep, in the range [0, 1] (i.e. 0% - 100%). */
	private double chi2Threshold;
	// presumably the number of top-ranked features kept after training; starts at 0 - confirm
	private int numFeatures = 0;
	// scorer holding the aggregated feature/outcome counts
	private Chi2Scorer<OUTCOME_T> chi2Function;
	// whether Yates' correction is requested; off by default
	private boolean yates = false;
	/**
	 * Creates a selector with a threshold of 0.0.
	 *
	 * @param name name passed on to the two-argument constructor
	 */
	public Chi2FeatureSelection(String name) {
		this(name, 0.0);
	}

	/**
	 * Creates a selector with the given threshold; Yates' correction stays off.
	 *
	 * @param name      name passed to the superclass constructor
	 * @param threshold fraction of features to keep; train() requires it to lie in [0, 1]
	 */
	public Chi2FeatureSelection(String name, double threshold) {
		super(name);
		this.chi2Threshold = threshold;
	}

	/**
	 * Constructor that lets the caller control Yates' correction.
	 *
	 * @param name      name passed to the superclass constructor
	 * @param threshold fraction of features to keep, in the range [0, 1]
	 * @param yates     true to use Yates' correction, false to turn it off
	 */
	public Chi2FeatureSelection(String name, double threshold, boolean yates) {
		super(name);
		// fraction of features to keep; range-checked later, when train() runs
		this.chi2Threshold = threshold;
		// forwarded to the Chi2Scorer created in train()
		this.yates = yates;
	}
	public boolean apply(Feature feature) {
		return this..contains(this.getFeatureName(feature));
	}
	public void train(Iterable<Instance<OUTCOME_T>> instances) {
		//check if chi2Threshold is bigger whithin range:
		if(this.<0 || this.>1){
			..println("Feature Selection threshold should be from 0 to 1");
			System.exit(0);
		}
		// aggregate statistics for all features
		this. = new Chi2Scorer<OUTCOME_T>(this.);
		for (Instance<OUTCOME_T> instance : instances) {
			OUTCOME_T outcome = instance.getOutcome();
			for (Feature feature : instance.getFeatures()) {
				if (this.isTransformable(feature)) {
					for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
						this..update(this.getFeatureName(untransformedFeature), outcome, 1);
					}
				}
			}
		}
		//    // keep only large chi2 valued features
		//    this.selectedFeatureNames = Sets.newHashSet();
		//    for (String featureName : this.chi2Function.featValueClassCount.rowKeySet()) {
		//      if (this.chi2Function.score(featureName) > this.chi2Threshold) {
		//        this.selectedFeatureNames.add(featureName);
		//      }
		//    }
		// sort features by Chi2 information score
		Ordering<Stringordering = Ordering.natural().onResultOf(this.).reverse();
		int totalFeatures = featureNames.size();
		this. = (int) Math.round(totalFeatures*this.);
		// keep only the top N features
				0,
		this. = Sets.newLinkedHashSet(ordering.immutableSortedCopy(featureNames).subList(this.totalFeatures));
		this. = true;
	}
	public void save(URI urithrows IOException {
		if (!this.) {
			throw new IllegalStateException("Cannot save before training");
		}
		File out = new File(uri);
		final String uriPath = uri.getPath();
		final int lastIndex = uriPath.lastIndexOf('.');
		final String discardPath = (lastIndex >= 0 ? uriPath.substring(0, lastIndex) : uriPath ) + "_discarded.dat";
		final File discardOut = new FilediscardPath );
//		File discardOut = new File(uri.getPath().substring(0,uri.getPath().lastIndexOf(".")) + "_discarded.dat");
		BufferedWriter writer = new BufferedWriter(new FileWriter(out));
		BufferedWriter diswriter = new BufferedWriter(new FileWriter(discardOut));
		for (String feature : this.) {
			writer.append(String.format("%s\t%f\n"featurethis..score(feature)));
		}
		for (String feature : this. ){
			diswriter.append(String.format("%s\t%f\n"featurethis..score(feature)));
		}
		writer.close();
		diswriter.close();
	}
	public void load(URI urithrows IOException {
		File in = new File(uri);
		BufferedReader reader = new BufferedReader(new FileReader(in));
		// The lines are <feature-name>\t<feature-score>
		String line = null;
		int n = 0;
		while ((line = reader.readLine()) != null && n < this.) {
			String[] featureValuePair = line.split("\t");
			this..add(featureValuePair[0]);
			n++;
		}
		reader.close();
		this. = true;
	}
New to GrepCode? Check out our FAQ X