Start line:  
End line:  

Snippet Preview

Snippet HTML Code

Stack Overflow Questions
Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
 
 
 package org.apache.mahout.df.data;
 
 import java.util.List;
 
 import  org.apache.hadoop.fs.FSDataInputStream;
 import  org.apache.hadoop.fs.FileSystem;
 import  org.apache.hadoop.fs.Path;
Converts the input data to a Vector Array using the information given by the Dataset.
Generates for each line a Vector that contains :
  • double parsed value for NUMERICAL attributes
  • int value for CATEGORICAL and LABEL attributes

Adds an IGNORED first attribute that will contain a unique id for each instance, which is the line number of the instance in the input data.
 
 public class DataLoader {
 
   /** Class-wide logger; the methods of this class use it to report malformed, empty, or skipped input lines. */
   private static final Logger log = LoggerFactory.getLogger(DataLoader.class);
 
   /** Private no-argument constructor; the members visible in this class are all static. */
   private DataLoader() {
   }

  
Converts a comma-separated String to a Vector.

Parameters:
id unique id for the current instance
attrs attributes description
values used to convert CATEGORICAL attribute values to Integer
string the comma-separated line to convert
Returns:
the parsed instance, or null if the line contains a missing value ('?') in a non-ignored attribute
 
   private static Instance parseString(int idAttribute[] attrs,
       List<String>[] valuesString string) {
     StringTokenizer tokenizer = new StringTokenizer(string", ");
     if (tokenizer.countTokens() != attrs.length) {
       .error(id + ": " + string);
       throw new IllegalArgumentException("Wrong number of attributes in the string");
     }
 
     // extract tokens and check is there is any missing value
     String[] tokens = new String[attrs.length];
     for (int attr = 0; attr < attrs.lengthattr++) {
       String token = tokenizer.nextToken();
 
       if (attrs[attr].isIgnored())
         continue;
 
       if ("?".equals(token))
         return null// missing value
 
       tokens[attr] = token;
     }
 
     int nbattrs = Dataset.countAttributes(attrs);
 
     DenseVector vector = new DenseVector(nbattrs);
 
     int aId = 0;
     int label = -1;
     for (int attr = 0; attr < attrs.lengthattr++) {
       if (attrs[attr].isIgnored())
         continue;
 
       String token = tokens[attr];
 
       if (attrs[attr].isNumerical()) {
         vector.set(aId++, Double.parseDouble(token));
       } else { // CATEGORICAL or LABEL
         // update values
        if (values[attr] == null)
          values[attr] = new ArrayList<String>();
        if (!values[attr].contains(token))
          values[attr].add(token);
        if (attrs[attr].isCategorical()) {
          vector.set(aId++, values[attr].indexOf(token));
        } else { // LABEL
          label = values[attr].indexOf(token);
        }
      }
    }
    if (label == -1)
      throw new IllegalStateException("Label not found!");
    return new Instance(idvectorlabel);
  }

  
Loads the data from a file

Parameters:
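dataset dataset used to convert each line into an instance
fs file system
fpath data file path
Returns:
the loaded data; empty lines and lines with missing values are skipped
Throws: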
IOException if any problem is encountered
  
  public static Data loadData(Dataset dataset, FileSystem fs, Path fpaththrows IOException {
    FSDataInputStream input = fs.open(fpath);
    Scanner scanner = new Scanner(input);
    
    List<Instanceinstances = new ArrayList<Instance>();
    DataConverter converter = new DataConverter(dataset);
    
    while (scanner.hasNextLine()) {
      String line = scanner.nextLine();
      if (line.isEmpty()) {
        .warn(instances.size() + ": empty string");
        continue;
      }
      
      Instance instance = converter.convert(instances.size(), line);
      if (instance == null) {
        // missing values found
        .warn(instances.size() + ": missing values");
        continue;
      }
      
      instances.add(instance);
    }
    scanner.close();
    
    return new Data(datasetinstances);
  }

  
Loads the data from a String array
  public static Data loadData(Dataset datasetString[] data) {
    List<Instanceinstances = new ArrayList<Instance>();
    DataConverter converter = new DataConverter(dataset);
    
    for (String line : data) {
      if (line.isEmpty()) {
        .warn(instances.size() + ": empty string");
        continue;
      }
      
      Instance instance = converter.convert(instances.size(), line);
      if (instance == null) {
        // missing values found
        .warn(instances.size() + ": missing values");
        continue;
      }
      
      instances.add(instance);
    }
    
    return new Data(datasetinstances);
  }

  
Generates the Dataset by parsing the entire data

Parameters:
descriptor attributes description
fs file system
path data path
  public static Dataset generateDataset(String descriptor, FileSystem fs, Path path)
      throws DescriptorExceptionIOException {
    Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
    FSDataInputStream input = fs.open(path);
    Scanner scanner = new Scanner(input);
    // used to convert CATEGORICAL attribute to Integer
    List<String>[] values = new List[attrs.length];
    
    int id = 0;
    while (scanner.hasNextLine()) {
      String line = scanner.nextLine();
      if (line.isEmpty()) {
        continue;
      }
      
      if (parseString(idattrsvaluesline) != null) {
        id++;
      }
    }
    scanner.close();
    return new Dataset(attrsvaluesid);
  }

  
Generates the Dataset by parsing the entire data

Parameters:
descriptor attributes description
data input lines, one instance per element
  public static Dataset generateDataset(String descriptorString[] datathrows DescriptorException {
    Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
    // used to convert CATEGORICAL and LABEL attributes to Integer
    List<String>[] values = new List[attrs.length];
    int id = 0;
    for (String aData : data) {
      if (aData.isEmpty()) {
        continue;
      }
      if (parseString(idattrsvaluesaData) != null) {
        id++;
      }
    }
    return new Dataset(attrsvaluesid);
  }
  
  
constructs the data

Parameters:
attrs attributes description
vectors data elements
values used to convert CATEGORICAL attributes to Integer
Returns:
Throws:
RuntimeException if no LABEL is found in the attributes description
  protected static Data constructData(Attribute[] attrs,
      List<InstancevectorsList<String>[] values) {
    Dataset dataset = new Dataset(attrsvaluesvectors.size());
    return new Data(datasetvectors);
  }
New to GrepCode? Check out our FAQ X