diff --git a/libdynet.cpp b/libdynet.cpp
index e5b9f0db2..de0a3e201 100644
--- a/libdynet.cpp
+++ b/libdynet.cpp
@@ -1,6 +1,12 @@
+//#include
+#define FMT_HEADER_ONLY 1
+#include <fmt/format.h>
 #include
+#include <fstream>
 #include
+#include <algorithm>
+
 using namespace std;
 #include
@@ -372,11 +378,11 @@ REFL_END
 void trainmain();
-static ParameterCollection model;
-size_t BATCH_SIZE=500;
-static ComputationGraph cg;
-static vector<Tensor> batch(BATCH_SIZE);
+
+size_t BATCH_SIZE=1000;
+static vector<vector<float>> batch(BATCH_SIZE);
 static int next_id = 0;
+static int batch_id = 0;
 void ggml_tensor_add(const char * name,const struct ggml_tensor * tensor);
 #include
@@ -395,12 +401,33 @@ void ggml_tensor_add(const char * name,const struct ggml_tensor * tensor){
     float* buffer = ggml_get_data_f32(tensor);
     //Expression x = input(cg, buffer);
     // runtime2::debug(std::cout,x);
-    Tensor eigen_tensor(Dim({num_elements},1),buffer, nullptr,DeviceMempool::NONE);
-    // Create a copy of the eigen tensor
-    // Tensor eigen_tensor_copy = eigen_tensor;
-    batch[(next_id++) % BATCH_SIZE] = eigen_tensor;
+    std::vector<float> values;
 
-    trainmain();
+    // copy the elements in
+    std::copy(buffer, &buffer[num_elements], back_inserter(values));
+
+    batch[(next_id++) % BATCH_SIZE] = values;
+
+    if ((next_id) % BATCH_SIZE == 0)
+    {
+      batch_id ++;
+      ofstream data_file; // pay attention here! ofstream
+
+      data_file.open(fmt::format("batch{}.bin", batch_id), ios::out | ios::binary);
+
+      for (auto &row: batch) {
+
+        uint32_t bsize = static_cast<uint32_t>(row.size());
+        data_file.write(reinterpret_cast<const char*>(&bsize), 4);
+
+        for (auto &cell: row) {
+          data_file.write(reinterpret_cast<const char*>(&cell), 4);
+        }
+      }
+      data_file.close();
+    }
+
+    //trainmain();
 
     //runtime2::debug(std::cout,batch);
 }
@@ -425,43 +452,256 @@ int ITERATIONS = 5;
 void trainmain() {
-    char** argv = 0;
-    //= {""};
-    int argc = 0;
-    dynet::initialize(argc,argv);
-    static SimpleSGDTrainer trainer(model);
-Parameter p_W = model.add_parameters({HIDDEN_SIZE, 2});
-Parameter p_b = model.add_parameters({HIDDEN_SIZE});
-Parameter p_V = model.add_parameters({1, HIDDEN_SIZE});
-Parameter p_a = model.add_parameters({1});
+// char** argv = 0;
+// //= {""};
+// int argc = 0;
+// dynet::initialize(argc,argv);
+// static SimpleSGDTrainer trainer(model);
+// Parameter p_W = model.add_parameters({HIDDEN_SIZE, 2});
+// Parameter p_b = model.add_parameters({HIDDEN_SIZE});
+// Parameter p_V = model.add_parameters({1, HIDDEN_SIZE});
+// Parameter p_a = model.add_parameters({1});
 
-Expression W = parameter(cg, p_W);
-Expression b = parameter(cg, p_b);
-Expression V = parameter(cg, p_V);
-Expression a = parameter(cg, p_a);
+// Expression W = parameter(cg, p_W);
+// Expression b = parameter(cg, p_b);
+// Expression V = parameter(cg, p_V);
+// Expression a = parameter(cg, p_a);
 
-  // Train the parameters.
-  for (unsigned iter = 0; iter < ITERATIONS; ++iter) {
-    double loss = 0;
-    for (unsigned mi = 0; mi < BATCH_SIZE; ++mi) {
+// // Train the parameters.
+// for (unsigned iter = 0; iter < ITERATIONS; ++iter) {
+// double loss = 0;
+// for (unsigned mi = 0; mi < BATCH_SIZE; ++mi) {
 
-      auto x_values = batch[mi];
-      auto y_value = x_values.batch_ptr(0);
+// auto x_values = batch[mi];
+// //auto y_value = x_values.batch_ptr(0);
 
-      Expression y = input(cg, y_value);
+// Expression y = input(cg, y_value);
 
-      Expression x = input(cg, x_values.batch_ptr(0));
-      Expression h = tanh(W*x + b);
-      Expression y_pred = V*h + a;
-      Expression loss_expr = squared_distance(y_pred, y);
+// Expression x = input(cg, x_values.batch_ptr(0));
+// Expression h = tanh(W*x + b);
+// Expression y_pred = V*h + a;
+// Expression loss_expr = squared_distance(y_pred, y);
 
-      loss += as_scalar(cg.forward(loss_expr));
-      cg.backward(loss_expr);
-      trainer.update();
-    }
-    loss /= 4;
-    cerr << "E = " << loss << endl;
-  }
+// loss += as_scalar(cg.forward(loss_expr));
+// cg.backward(loss_expr);
+// trainer.update();
+// }
+// loss /= 4;
+// cerr << "E = " << loss << endl;
+// }
 }
+
+
+
+
+#include <iostream>
+#include <fstream>
+#include <unordered_map>
+#include <chrono>
+#ifdef BOOST_REGEX
+  #include <boost/regex.hpp>
+  using namespace boost;
+#else
+  #include <regex>
+#endif
+
+#include <dynet/training.h>
+#include <dynet/expr.h>
+#include <dynet/dict.h>
+#include <dynet/lstm.h>
+
+using namespace std;
+using namespace std::chrono;
+using namespace dynet;
+
+// Read a file where each line is of the form "word1|tag1 word2|tag2 ..."
+// Yields pairs of lists of the form < [word1, word2, ...], [tag1, tag2, ...] >
+vector<pair<vector<string>, vector<string> > > read(const string & fname) {
+  ifstream fh(fname);
+  if(!fh) throw std::runtime_error("Could not open file");
+  string str;
+  regex re("[ |]");
+  vector<pair<vector<string>, vector<string> > > sents;
+  while(getline(fh, str)) {
+    pair<vector<string>, vector<string> > word_tags;
+    sregex_token_iterator first{str.begin(), str.end(), re, -1}, last;
+    while(first != last) {
+      word_tags.first.push_back(*first++);
+      assert(first != last);
+      word_tags.second.push_back(*first++);
+    }
+    sents.push_back(word_tags);
+  }
+  return sents;
+}
+
+class BiLSTMTagger {
+public:
+
+  BiLSTMTagger(unsigned layers, unsigned wembed_dim, unsigned hidden_dim, unsigned mlp_dim, ParameterCollection & model, Dict & wv, Dict & tv, unordered_map<string, int> & wc)
+      : wv(wv), tv(tv), wc(wc) {
+    unsigned nwords = wv.size();
+    unsigned ntags = tv.size();
+    word_lookup = model.add_lookup_parameters(nwords, {wembed_dim});
+
+    // MLP on top of biLSTM outputs 100 -> 32 -> ntags
+    pH = model.add_parameters({mlp_dim, hidden_dim*2});
+    pO = model.add_parameters({ntags, mlp_dim});
+
+    // word-level LSTMs
+    fwdRNN = VanillaLSTMBuilder(layers, wembed_dim, hidden_dim, model); // layers, in-dim, out-dim, model
+    bwdRNN = VanillaLSTMBuilder(layers, wembed_dim, hidden_dim, model);
+  }
+
+  Dict &wv, &tv;
+  unordered_map<string, int> & wc;
+  LookupParameter word_lookup;
+  Parameter pH, pO;
+  VanillaLSTMBuilder fwdRNN, bwdRNN;
+
+  // Do word representation
+  Expression word_rep(ComputationGraph & cg, const string & w) {
+    return lookup(cg, word_lookup, wv.convert(wc[w] > 5 ? w : "<unk>"));
+  }
+
+  vector<Expression> build_tagging_graph(ComputationGraph & cg, const vector<string> & words) {
+    // parameters -> expressions
+    Expression H = parameter(cg, pH);
+    Expression O = parameter(cg, pO);
+
+    // initialize the RNNs
+    fwdRNN.new_graph(cg);
+    bwdRNN.new_graph(cg);
+
+    // get the word vectors. word_rep(...) returns a 128-dim vector expression for each word.
+    vector<Expression> wembs(words.size()), fwds(words.size()), bwds(words.size()), fbwds(words.size());
+    for(size_t i = 0; i < words.size(); ++i)
+      wembs[i] = word_rep(cg, words[i]);
+
+    // feed word vectors into biLSTM
+    fwdRNN.start_new_sequence();
+    for(size_t i = 0; i < wembs.size(); ++i)
+      fwds[i] = fwdRNN.add_input(wembs[i]);
+    bwdRNN.start_new_sequence();
+    for(size_t i = wembs.size(); i > 0; --i)
+      bwds[i-1] = bwdRNN.add_input(wembs[i-1]);
+
+    // Concatenate and MLP
+    for(size_t i = 0; i < wembs.size(); ++i)
+      fbwds[i] = O * tanh( H * concatenate({fwds[i], bwds[i]}) );
+
+    return fbwds;
+  }
+
+  Expression sent_loss(ComputationGraph & cg, vector<string> & words, vector<string> & tags) {
+    vector<Expression> exprs = build_tagging_graph(cg, words), errs(words.size());
+    for(size_t i = 0; i < tags.size(); ++i)
+      errs[i] = pickneglogsoftmax(exprs[i], tv.convert(tags[i]));
+    return sum(errs);
+  }
+
+  vector<string> tag_sent(vector<string> & words) {
+    ComputationGraph cg;
+    vector<Expression> exprs = build_tagging_graph(cg, words), errs(words.size());
+    vector<string> tags(words.size());
+    for(size_t i = 0; i < words.size(); ++i) {
+      vector<float> scores = as_vector(exprs[i].value());
+      size_t max_id = distance(scores.begin(), max_element(scores.begin(), scores.end()));
+      tags[i] = tv.convert(max_id);
+    }
+    return tags;
+  }
+
+};
+
+int othermain() {
+  int argc=0;
+  char**argv=0;
+  time_point<system_clock> start = system_clock::now();
+
+  vector<pair<vector<string>, vector<string> > > train = read("data/tags/train.txt");
+  vector<pair<vector<string>, vector<string> > > dev = read("data/tags/dev.txt");
+  Dict word_voc, tag_voc;
+  unordered_map<string, int> word_cnt;
+  for(auto & sent : train) {
+    for(auto & w : sent.first) {
+      word_voc.convert(w);
+      word_cnt[w]++;
+    }
+    for(auto & t : sent.second)
+      tag_voc.convert(t);
+  }
+  tag_voc.freeze();
+  word_voc.convert("<unk>"); word_voc.freeze(); word_voc.set_unk("<unk>");
+
+  // DyNet Starts
+  dynet::initialize(argc, argv);
+  ParameterCollection model;
+  AdamTrainer trainer(model);
+  trainer.clipping_enabled = false;
+
+  if(argc != 6) {
+    cerr << "Usage: " << argv[0] << " WEMBED_SIZE HIDDEN_SIZE MLP_SIZE SPARSE TIMEOUT" << endl;
+    return 1;
+  }
+  int WEMBED_SIZE = atoi(argv[1]);
+  int HIDDEN_SIZE = atoi(argv[2]);
+  int MLP_SIZE = atoi(argv[3]);
+  trainer.sparse_updates_enabled = atoi(argv[4]);
+  int TIMEOUT = atoi(argv[5]);
+
+  // Initialize the tagger
+  BiLSTMTagger tagger(1, WEMBED_SIZE, HIDDEN_SIZE, MLP_SIZE, model, word_voc, tag_voc, word_cnt);
+
+  {
+    duration<float> fs = (system_clock::now() - start);
+    float startup_time = duration_cast<milliseconds>(fs).count() / float(1000);
+    cout << "startup time: " << startup_time << endl;
+  }
+
+  // Do training
+  start = system_clock::now();
+  int i = 0, all_tagged = 0, this_words = 0;
+  float this_loss = 0.f, all_time = 0.f;
+  for(int iter = 0; iter < 100; iter++) {
+    shuffle(train.begin(), train.end(), *dynet::rndeng);
+    for(auto & s : train) {
+      i++;
+      if(i % 500 == 0) {
+        trainer.status();
+        cout << this_loss/this_words << endl;
+        all_tagged += this_words;
+        this_loss = 0.f;
+        this_words = 0;
+      }
+      if(i % 10000 == 0) {
+        duration<float> fs = (system_clock::now() - start);
+        all_time += duration_cast<milliseconds>(fs).count() / float(1000);
+        int dev_words = 0, dev_good = 0;
+        float dev_loss = 0;
+        for(auto & sent : dev) {
+          vector<string> tags = tagger.tag_sent(sent.first);
+          for(size_t j = 0; j < tags.size(); ++j)
+            if(tags[j] == sent.second[j])
+              dev_good++;
+          dev_words += sent.second.size();
+        }
+        cout << "acc=" << dev_good/float(dev_words) << ", time=" << all_time << ", word_per_sec=" << all_tagged/all_time << endl;
+        if(all_time > TIMEOUT)
+          exit(0);
+        start = system_clock::now();
+      }
+
+      ComputationGraph cg;
+      Expression loss_exp = tagger.sent_loss(cg, s.first, s.second);
+      float my_loss = as_scalar(cg.forward(loss_exp));
+      this_loss += my_loss;
+      this_words += s.first.size();
+      cg.backward(loss_exp);
+      trainer.update();
+    }
+  }
+  return 0;
+}