/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.mahout.classifier;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.mahout.common.IOUtils; // quietClose() helper; exact package assumed here
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

/**
 * Flatten a file into a format that can be read by the Bayes M/R job.
 * <p/>
 * One document per line: the first token is the label, followed by a tab; the rest of the line is
 * the document's terms.
 */
 public class BayesFileFormatter {
 
   private static final Logger log = LoggerFactory.getLogger(BayesFileFormatter.class);
 
   private static final String LINE_SEP = System.getProperty("line.separator");
 
   private BayesFileFormatter() {
   }

  
  /**
   * Collapse all the files in the inputDir into a single file in the proper Bayes format,
   * one document per line.
   *
   * @param label      the label
   * @param analyzer   the analyzer to use
   * @param inputDir   the input directory
   * @param charset    the charset of the input files
   * @param outputFile the file to collapse to
   */
  public static void collapse(String label, Analyzer analyzer, File inputDir,
                              Charset charset, File outputFile) throws IOException {
    Writer writer = new OutputStreamWriter(new FileOutputStream(outputFile), charset);
    try {
      // listFiles() is used here as a way to visit every file recursively;
      // the FileProcessor does the real work in its accept() callback
      inputDir.listFiles(new FileProcessor(label, analyzer, charset, writer));
    } finally {
      IOUtils.quietClose(writer);
    }
  }
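
  // Usage sketch (not from the original source; the paths and label below are illustrative only):
  //
  //   File inputDir = new File("/tmp/20news/comp.graphics");          // hypothetical input directory
  //   File outputFile = new File("/tmp/bayes-input/comp.graphics.txt");
  //   BayesFileFormatter.collapse("comp.graphics", new StandardAnalyzer(),
  //       inputDir, Charset.forName("UTF-8"), outputFile);
  //
  // Every file under inputDir becomes one line of outputFile, in the form:
  //   comp.graphics<TAB>term1 term2 term3 ...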

  
  /**
   * Write the input files to the outDir, one output file per input file.
   *
   * @param label    the label of the file
   * @param analyzer the analyzer to use
   * @param input    the input file or directory; may not be null
   * @param charset  the character set of the input files
   * @param outDir   the output directory; files will be written there with the same name as the
   *                 input file
   */
  public static void format(String label, Analyzer analyzer, File input,
                            Charset charset, File outDir) throws IOException {
    if (input.isDirectory()) {
      input.listFiles(new FileProcessor(label, analyzer, charset, outDir));
    } else {
      Writer writer = new OutputStreamWriter(new FileOutputStream(new File(
          outDir, input.getName())), charset);
      try {
        writeFile(label, analyzer, input, charset, writer);
      } finally {
        IOUtils.quietClose(writer);
      }
    }
  }
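
  // Usage sketch (illustrative paths; the output directory is assumed to exist):
  //
  //   BayesFileFormatter.format("comp.graphics", new StandardAnalyzer(),
  //       new File("/tmp/20news/comp.graphics"), Charset.forName("UTF-8"),
  //       new File("/tmp/bayes-input/comp.graphics"));
  //
  // Unlike collapse(), this writes one output file per input file, each named after its input.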

  
  /**
   * Hack the FileFilter mechanism so that we don't get stuck on large directories and don't have
   * to loop the list twice.
   */
  private static class FileProcessor implements FileFilter {
    private final String label;
    private final Analyzer analyzer;
    private File outputDir;
    private final Charset charset;
    private Writer writer;

    
    /**
     * Use this when you want to collapse all files to a single file.
     *
     * @param label  the label
     * @param writer must not be null and will not be closed
     */
    private FileProcessor(String label, Analyzer analyzer, Charset charset,
                          Writer writer) {
      this.label = label;
      this.analyzer = analyzer;
      this.charset = charset;
      this.writer = writer;
    }

    
    /**
     * Use this when you want a writer per file.
     *
     * @param outputDir must not be null
     */
    private FileProcessor(String label, Analyzer analyzer, Charset charset,
                          File outputDir) {
      this.label = label;
      this.analyzer = analyzer;
      this.charset = charset;
      this.outputDir = outputDir;
    }
    @Override
    public boolean accept(File file) {
      if (file.isFile()) {
        Writer theWriter = null;
        try {
          if (writer == null) {
            // one output file per input file, named after the input and written to outputDir
            theWriter = new OutputStreamWriter(new FileOutputStream(new File(
                outputDir, file.getName())), charset);
          } else {
            // collapsing: reuse the single shared writer
            theWriter = writer;
          }
          writeFile(label, analyzer, file, charset, theWriter);
          if (writer != null) {
            // just write a new line to separate documents in the collapsed file
            theWriter.write(LINE_SEP);
          }
        } catch (IOException e) {
          // TODO: report failed files instead of throwing exception
          throw new IllegalStateException(e);
        } finally {
          if (writer == null) {
            IOUtils.quietClose(theWriter);
          }
        }
      } else {
        // a directory: recurse by letting listFiles() call accept() on each child
        file.listFiles(this);
      }
      return false;
    }
  }
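
  // Note on the FileProcessor trick: java.io.File.listFiles(FileFilter) invokes accept() once per
  // directory entry, so the processing happens as a side effect of the listing itself and accept()
  // can always return false (nothing needs to be collected).  A more conventional sketch of the
  // same traversal (process(File) is a hypothetical stand-in for writeFile above):
  //
  //   for (File f : dir.listFiles()) {
  //     if (f.isFile()) {
  //       process(f);            // handle one document
  //     } else {
  //       // recurse into the sub-directory
  //     }
  //   }
  //
  // which would build the full listing and then walk it a second time.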

  
  /**
   * Write the label and the tokens from the Reader to the writer.
   *
   * @param label    the label
   * @param analyzer the analyzer to use
   * @param inFile   the file to read; its contents are passed to the analyzer
   * @param charset  the character encoding to assume when reading the input file
   * @param writer   the Writer; it is not closed by this method
   * @throws java.io.IOException if there was a problem with the reader
   */
  private static void writeFile(String label, Analyzer analyzer, File inFile,
                                Charset charset, Writer writer) throws IOException {
    Reader reader = new InputStreamReader(new FileInputStream(inFile), charset);
    try {
      TokenStream ts = analyzer.tokenStream(label, reader);
      writer.write(label);
      writer.write('\t'); // tab separator, in order to match Hadoop's standard TextInputFormat
      Token token = new Token();
      while ((token = ts.next(token)) != null) {
        char[] termBuffer = token.termBuffer();
        int termLen = token.termLength();
        writer.write(termBuffer, 0, termLen);
        writer.write(' ');
      }
    } finally {
      IOUtils.quietClose(reader);
    }
  }
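
  // For illustration: with label "comp.graphics" and an analyzer that yields the tokens
  // ["opengl", "rendering", "fast"], the writer receives the single line (trailing space included):
  //
  //   comp.graphics\topengl rendering fast 
  //
  // The newline between documents is added by the caller (FileProcessor) when collapsing.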

  
  /**
   * Convert a Reader to a document vector.
   *
   * @param analyzer the Analyzer to use
   * @param reader   the reader to feed to the Analyzer
   * @return an array of the tokens produced by the analyzer
   */
  public static String[] readerToDocument(Analyzer analyzer, Reader reader)
      throws IOException {
    TokenStream ts = analyzer.tokenStream("", reader);
    Token token;
    List<String> coll = new ArrayList<String>();
    while ((token = ts.next()) != null) {
      char[] termBuffer = token.termBuffer();
      int termLen = token.termLength();
      String val = new String(termBuffer, 0, termLen);
      coll.add(val);
    }
    return coll.toArray(new String[coll.size()]);
  }
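
  // Usage sketch (uses java.io.StringReader; actual tokens depend on the analyzer's rules):
  //
  //   String[] terms = BayesFileFormatter.readerToDocument(
  //       new StandardAnalyzer(), new StringReader("The quick brown fox"));
  //   // terms would typically be ["quick", "brown", "fox"] after lowercasing and stop-word removal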

  
  /**
   * Run the FileFormatter.
   *
   * @param args the input arguments; run with -h to see the help
   * @throws ClassNotFoundException if the Analyzer can't be found
   * @throws IllegalAccessException if the Analyzer can't be constructed
   * @throws InstantiationException if the Analyzer can't be constructed
   * @throws IOException            if the files can't be dealt with properly
   */
  public static void main(String[] args) throws ClassNotFoundException,
      IllegalAccessException, InstantiationException, IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
        abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
        withDescription("The Input file").withShortName("i").create();
    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
        withDescription("The output file").withShortName("o").create();
    Option labelOpt = obuilder.withLongName("label").withRequired(true).withArgument(
        abuilder.withName("label").withMinimum(1).withMaximum(1).create()).
        withDescription("The label of the file").withShortName("l").create();
    Option analyzerOpt = obuilder.withLongName("analyzer").withArgument(
        abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create()).
        withDescription("The fully qualified class name of the analyzer to use.  Must have a no-arg constructor.  Default is the StandardAnalyzer").withShortName("a").create();
    Option charsetOpt = obuilder.withLongName("charset").withArgument(
        abuilder.withName("charset").withMinimum(1).withMaximum(1).create()).
        withDescription("The character encoding of the input file").withShortName("c").create();
    Option collapseOpt = obuilder.withLongName("collapse").withRequired(true).withArgument(
        abuilder.withName("collapse").withMinimum(1).withMaximum(1).create()).
        withDescription("Collapse a whole directory to a single file, one doc per line").withShortName("p").create();
    Option helpOpt = obuilder.withLongName("help").withRequired(true).
        withDescription("Print out help").withShortName("h").create();
    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
        .withOption(labelOpt).withOption(analyzerOpt).withOption(charsetOpt)
        .withOption(collapseOpt).withOption(helpOpt).create();
    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        return;
      }
      File input = new File((String) cmdLine.getValue(inputOpt));
      File output = new File((String) cmdLine.getValue(outputOpt));
      String label = (String) cmdLine.getValue(labelOpt);
      Analyzer analyzer;
      if (cmdLine.hasOption(analyzerOpt)) {
        analyzer = Class.forName((String) cmdLine.getValue(analyzerOpt))
            .asSubclass(Analyzer.class).newInstance();
      } else {
        analyzer = new StandardAnalyzer();
      }
      Charset charset = Charset.forName("UTF-8");
      if (cmdLine.hasOption(charsetOpt)) {
        charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
      }
      boolean collapse = cmdLine.hasOption(collapseOpt);
      if (collapse) {
        collapse(label, analyzer, input, charset, output);
      } else {
        format(label, analyzer, input, charset, output);
      }
    } catch (OptionException e) {
      log.error("Exception", e);
    }
  }
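
  // Example invocation (the jar name and paths are illustrative, not taken from the project build):
  //
  //   java -cp mahout-core.jar org.apache.mahout.classifier.BayesFileFormatter \
  //     --input /tmp/20news/comp.graphics \
  //     --output /tmp/bayes-input/comp.graphics.txt \
  //     --label comp.graphics --charset UTF-8 --collapse true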
}