I'm going to use GloVe with Deeplearning4j, a Java library.
Prepare the corpus you want to study in advance. For a Japanese corpus, segment the text into space-separated words (wakati-gaki) first. When segmenting, it is often better to convert verbs to their dictionary (base) form.
Save the corpus text file as **input.txt**. The trained model will be saved as **model.txt**.
ModelBuild.java
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.glove.Glove;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import java.io.*;
/**
 * Trains a GloVe word-embedding model with Deeplearning4j.
 *
 * <p>Reads a pre-tokenized corpus from {@code input.txt} (one sentence per
 * line) and writes the trained word vectors to {@code model.txt} in text
 * format, loadable later via {@code WordVectorSerializer.loadTxtVectors}.
 */
public class ModelBuild {
    public static void main(String[] args) throws Exception {

        // Corpus file: one whitespace-separated sentence per line.
        System.out.println("Reading data...");
        File corpusFile = new File("input.txt");

        // Iterates over the corpus line by line (each line = one sentence).
        SentenceIterator sentences = new BasicLineIterator(corpusFile);

        // Tokenizer: splits each sentence on whitespace and normalizes
        // tokens (lowercasing, punctuation stripping) via CommonPreprocessor.
        System.out.println("Create a tokenizer...");
        TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
        tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

        // Configure the GloVe model. Hyperparameters follow the original
        // GloVe paper's defaults where applicable (alpha=0.75, xMax=100).
        System.out.println("Creating a model...");
        Glove model = new Glove.Builder()
                .iterate(sentences)       // sentence source
                .tokenizerFactory(tokenizerFactory) // word splitter
                .alpha(0.75)              // exponent of the weighting function
                .learningRate(0.1)        // initial learning rate
                .epochs(25)               // passes over the training corpus
                .layerSize(300)           // embedding dimensionality
                .maxMemory(2)             // memory cap (GB — TODO confirm unit against DL4J docs)
                .xMax(100)                // weighting-function cutoff
                .batchSize(1000)          // words per mini-batch
                .windowSize(10)           // co-occurrence context window
                .shuffle(true)            // shuffle training batches
                .symmetric(true)          // count context on both sides of the target word
                .build();

        // Run training.
        System.out.println("I'm learning...");
        model.fit();

        // Persist the vectors in text format.
        System.out.println("Saving the model...");
        WordVectorSerializer.writeWordVectors(model, "model.txt");

        System.out.println("The program is over");
    }
}
Evaluation.java
import java.io.File;
import java.io.FileNotFoundException;
import java.io.UnsupportedEncodingException;
import java.util.Collection;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.embeddings.wordvectors.WordVectors;
/**
 * Loads a trained GloVe model (text format) and demonstrates two queries:
 * the top-10 words most similar to a probe word, and the cosine similarity
 * between two words.
 *
 * <p>Usage: {@code java Evaluation [modelFile]} — defaults to
 * {@code model.txt} (the path ModelBuild saves to) when no argument is given.
 */
public class Evaluation {
	public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
		// Load the model file. Fix: the original indexed args[0]
		// unconditionally and threw ArrayIndexOutOfBoundsException when run
		// without arguments; fall back to the documented default "model.txt".
    	System.out.println("Loading model file...");
    	File inputFile = new File(args.length > 0 ? args[0] : "model.txt");
    	WordVectors vec = WordVectorSerializer.loadTxtVectors(inputFile);

    	// Top-10 nearest neighbors of a probe word (e.g. "weather").
    	System.out.println("Top 10 similar words...");
    	String  word        = "weather";
        int     ranking     = 10;
        Collection<String>  similarTop10    = vec.wordsNearest( word , ranking );
        System.out.println( String.format( "Similar word to 「%s」 is %s" , word , similarTop10 ) );

        // Cosine similarity between two words (e.g. "Sunny" and "rain").
        // NOTE(review): returns NaN if either word is out of vocabulary —
        // confirm against the DL4J WordVectors.similarity contract.
        System.out.println( "Show cosine similarity..." );
        String  word1       = "Sunny";
        String  word2       = "rain";
        double  similarity  = vec.similarity( word1 , word2 );
        System.out.println( String.format( "The similarity between 「%s」 and 「%s」 is %f" , word1 , word2 , similarity ) );
	}
}
Recommended Posts