/*
 * Decompiled with CFR 0.152.
 */
package iitb.cfilt.cpost.tokenizer;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.text.BreakIterator;
import java.util.Vector;

/**
 * Splits the contents of a UTF-8 text file into word-level tokens using
 * {@link BreakIterator#getWordInstance()}.
 *
 * <p>Tokens are accumulated in an instance-level list: every call to
 * {@link #tokenize(String)} appends to the same {@link Vector} and returns
 * that same instance, so results of earlier calls remain in the returned list.
 */
public class Tokenizer {
    /** All tokens collected so far by this instance, in reading order. */
    private Vector<String> tokens = new Vector<String>();

    /**
     * Reads the given file line by line, tokenizes each line and appends the
     * resulting tokens to this instance's token list.
     *
     * <p>I/O failures (missing file, unsupported encoding, read errors) are
     * reported via {@link Throwable#printStackTrace()} and do not propagate;
     * whatever was collected before the failure is still returned.
     *
     * @param filename path of the file to read; decoded as UTF-8
     * @return the token list of this instance (the live internal list, not a copy)
     */
    public Vector<String> tokenize(String filename) {
        // try-with-resources guarantees the underlying file handle is released
        // even when reading fails part-way through.
        try (InputStream in = new FileInputStream(filename);
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                this.tokens.addAll(this.tokenizeSentence(line));
            }
        }
        catch (IOException e) {
            // Covers FileNotFoundException and UnsupportedEncodingException as
            // well (both are IOException subclasses); kept best-effort.
            e.printStackTrace();
        }
        return this.tokens;
    }

    /**
     * Splits a single line into the segments reported by a word-instance
     * {@link BreakIterator} for the default locale.
     *
     * <p>Segments consisting of exactly one space character are dropped; every
     * other segment (including punctuation) is kept as its own token.
     *
     * @param line the text to split
     * @return the tokens of {@code line} in order of appearance
     */
    private Vector<String> tokenizeSentence(String line) {
        Vector<String> words = new Vector<String>();
        BreakIterator boundary = BreakIterator.getWordInstance();
        boundary.setText(line);
        int start = boundary.first();
        for (int end = boundary.next(); end != BreakIterator.DONE; end = boundary.next()) {
            String word = line.substring(start, end);
            if (!" ".equals(word)) {
                words.add(word);
            }
            start = end;
        }
        return words;
    }
}

