/*
 * Decompiled with CFR 0.152.
 */
package slib.sml.sm.core.measures.corpus;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import slib.sml.sm.core.measures.corpus.Matrix;
import slib.sml.sm.core.measures.corpus.MatrixType;
import slib.utils.FileUtils;
import slib.utils.ex.SLIB_Ex_Critic;

/**
 * Builds a vocabulary/context {@link Matrix} from the textual fields of Lucene
 * {@link Document}s.
 *
 * <p>Two layouts are supported, selected by the {@link MatrixType} given at
 * construction time:
 * <ul>
 *   <li>{@code WORD_WORD}: for every unordered pair of distinct words found in
 *       the same field value, {@code 1.0} is added in both directions through
 *       {@code Matrix.addValue}. When a vocabulary is set, only pairs whose two
 *       words belong to it are counted.</li>
 *   <li>{@code WORD_DOC}: for every distinct word of a field value,
 *       {@code 1.0} is added for the (word, document) pair. The vocabulary is
 *       not applied in this mode.</li>
 * </ul>
 *
 * <p>Words are obtained by splitting the field value on single whitespace
 * characters; each distinct word is counted once per field value.
 */
public class VocContextMatrixBuilder {
    Logger logger = LoggerFactory.getLogger(this.getClass());
    /** Names of the document fields to analyse, without duplicates, in insertion order. */
    List<String> docFields = new ArrayList<String>();
    /** Words of interest; {@code null} disables vocabulary filtering in WORD_WORD mode. */
    Set<String> vocabulary;
    /** Matrix being filled; stays {@code null} for a matrix type this builder does not handle. */
    Matrix matrix;
    MatrixType matrixType;

    /**
     * Creates a builder with no document field configured yet.
     *
     * @param matrixType layout of the matrix to build; must not be {@code null}
     * @param voc vocabulary used to restrict WORD_WORD co-occurrences, or
     *            {@code null} to accept every word
     */
    public VocContextMatrixBuilder(MatrixType matrixType, Set<String> voc) {
        this.matrixType = matrixType;
        this.vocabulary = voc;
        switch (matrixType) {
            case WORD_WORD:
            case WORD_DOC:
                this.matrix = new Matrix();
                break;
            default:
                // Any other matrix type is not handled here: the matrix is left null.
                break;
        }
        this.logger.info("Matrix Builder: {}", this.matrixType);
    }

    /**
     * @return the matrix filled so far (the live instance, not a copy)
     */
    public Matrix getMatrix() {
        return this.matrix;
    }

    /**
     * Creates a builder and registers the document fields to analyse.
     *
     * @param matrixType layout of the matrix to build; must not be {@code null}
     * @param voc vocabulary used to restrict WORD_WORD co-occurrences, or
     *            {@code null} to accept every word
     * @param docFields names of the document fields to analyse
     */
    public VocContextMatrixBuilder(MatrixType matrixType, Set<String> voc, List<String> docFields) {
        this(matrixType, voc);
        this.setDocFields(docFields);
    }

    /**
     * @return the live list of field names that will be analysed
     */
    public List<String> getDocFields() {
        return this.docFields;
    }

    /**
     * Registers every given field name. Despite its name this method adds to
     * the already registered fields; it does not replace them.
     *
     * @param s field names to register
     */
    public final void setDocFields(Collection<String> s) {
        for (String fieldName : s) {
            this.addDocFields(fieldName);
        }
    }

    /**
     * Registers a single field name, ignoring it if it is already registered.
     *
     * @param s field name to register
     */
    public void addDocFields(String s) {
        if (!this.docFields.contains(s)) {
            this.docFields.add(s);
        }
    }

    /**
     * Processes every given document in iteration order.
     *
     * @param documents documents to analyse
     * @throws SLIB_Ex_Critic if no document field has been registered, or if
     *                        processing a document fails
     */
    public void buildMatrix(Iterable<Document> documents) throws SLIB_Ex_Critic {
        this.logger.info("Building voc-context matrix");
        this.logger.info("voc: {}", this.vocabulary);
        this.logger.info("fields: {}", this.docFields);
        if (this.docFields.isEmpty()) {
            throw new SLIB_Ex_Critic("Please specify a field to analyse in the given documents");
        }
        for (Document doc : documents) {
            this.process(doc);
        }
    }

    /**
     * Updates the matrix with the words found in each registered field of the
     * given document.
     *
     * <p>A field that is absent from the document, or that has no string value,
     * is skipped with a warning and the remaining fields are still processed.
     *
     * @param doc document to analyse
     * @throws SLIB_Ex_Critic if updating the matrix fails
     */
    public void process(Document doc) throws SLIB_Ex_Critic {
        this.logger.debug("processing doc: {}", doc);
        for (String field : this.docFields) {
            String content = doc.get(field);
            this.logger.info("{} : {}", field, content);
            if (content == null) {
                // Only this field is unusable: keep going with the other fields.
                this.logger.warn("Skip field " + field);
                continue;
            }
            Set<String> words = tokenize(content);
            switch (this.matrixType) {
                case WORD_WORD:
                    this.addWordWordCounts(words);
                    break;
                case WORD_DOC:
                    this.addWordDocCounts(words, doc);
                    break;
                default:
                    break;
            }
        }
    }

    /** Splits text on single whitespace characters and returns the distinct non-empty tokens. */
    private static Set<String> tokenize(String content) {
        Set<String> tokens = new HashSet<String>(Arrays.asList(content.split("\\s")));
        // Leading or consecutive whitespace makes split() emit "" which is not a word.
        tokens.remove("");
        return tokens;
    }

    /** Adds 1.0 in both directions for every unordered pair of in-vocabulary words. */
    private void addWordWordCounts(Set<String> words) throws SLIB_Ex_Critic {
        if (this.vocabulary != null) {
            // Filter once up front instead of testing membership for every pair.
            words.retainAll(this.vocabulary);
        }
        String[] kept = words.toArray(new String[words.size()]);
        for (int i = 0; i < kept.length; ++i) {
            for (int j = i + 1; j < kept.length; ++j) {
                this.matrix.addValue(kept[i], kept[j], 1.0);
                this.matrix.addValue(kept[j], kept[i], 1.0);
            }
        }
    }

    /** Adds 1.0 for every (word, document) pair. */
    private void addWordDocCounts(Set<String> words, Document doc) throws SLIB_Ex_Critic {
        for (String word : words) {
            this.matrix.addValue(word, doc, 1.0);
        }
    }

    /**
     * Demo entry point: builds a WORD_WORD matrix over the {@code .txt} files of
     * a folder, using a small fixed vocabulary, and prints the matrix rows.
     *
     * @param args optional; {@code args[0]} is the folder to scan, defaulting to
     *             {@code /data/tmp/wiki/}
     * @throws SLIB_Ex_Critic if listing the files or building the matrix fails
     * @throws IOException if a file cannot be read
     */
    public static void main(String[] args) throws SLIB_Ex_Critic, IOException {
        String folder = args.length > 0 ? args[0] : "/data/tmp/wiki/";
        String[] ext = new String[]{"txt"};
        List<File> files = FileUtils.listFilesForFolder(folder, Arrays.asList(ext), 100000);

        List<String> docField = new ArrayList<String>();
        docField.add("content");

        String[] vocArray = new String[]{"lion", "panthera", "Africa", "lamb", "insecticides", "animal", "Genealogists", "rugby", "football", "Sydney", "Australia"};
        Set<String> voc = new HashSet<String>(Arrays.asList(vocArray));

        VocContextMatrixBuilder matrixBuilder = new VocContextMatrixBuilder(MatrixType.WORD_WORD, voc, docField);
        for (File f : files) {
            Document doc = new Document();
            String fileAsString = FileUtils.readFile(f.getAbsolutePath(), Charset.defaultCharset());
            doc.add(new StoredField("content", fileAsString));
            matrixBuilder.process(doc);
        }

        Matrix mat = matrixBuilder.getMatrix();
        System.out.println("size: " + mat.getInternalStorage().keySet().size());
        for (String s : mat.getInternalStorage().keySet()) {
            System.out.println(s + "\t(" + mat.getInternalStorage().get(s).size() + ")\t" + mat.getInternalStorage().get(s));
        }
    }
}

