gensim.models.doc2vec.TaggedDocument

Here are the examples of the python api gensim.models.doc2vec.TaggedDocument taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.

5 Examples

Example 1

Project: gensim
License: View license
Source File: test_doc2vec.py
    def test_mixed_tag_types(self):
        """Ensure alternating int/string tags don't share indexes in doctag_syn0.

        Builds a corpus where each document carries both an int tag (its
        position) and a string tag (its first word), then checks that the
        doctag vector matrix has one row per sentence plus one per unique
        string doctag.
        """
        # The gensim class is TaggedDocument (was misspelled "TaggedDocuement").
        mixed_tag_corpus = [doc2vec.TaggedDocument(words, [i, words[0]]) for i, words in enumerate(raw_sentences)]
        model = doc2vec.Doc2Vec()
        model.build_vocab(mixed_tag_corpus)
        expected_length = len(sentences) + len(model.docvecs.doctags)  # 9 sentences, 7 unique first tokens
        # assertEqual, not the deprecated assertEquals alias.
        self.assertEqual(len(model.docvecs.doctag_syn0), expected_length)

Example 2

Project: gensim
License: View license
Source File: test_doc2vec.py
Function: testbuildvocabwarning
    @log_capture()
    def testBuildVocabWarning(self, l):
        """Test if logger warning is raised on non-ideal input to a doc2vec model.

        Each 'words' here is a plain string instead of a list of tokens, so
        build_vocab should log a warning about the malformed corpus.
        """
        raw_sentences = ['human', 'machine']
        # The gensim class is TaggedDocument (was misspelled "TaggedDocuement").
        sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(raw_sentences)]
        model = doc2vec.Doc2Vec()
        model.build_vocab(sentences)
        warning = "Each 'words' should be a list of words (usually unicode strings)."
        self.assertTrue(warning in str(l))

Example 3

Project: gensim
License: View license
Source File: test_doc2vec.py
Function: testtrainwarning
    @log_capture()
    def testTrainWarning(self, l):
        """Test if warning is raised if alpha rises during subsequent calls to train().

        Decays alpha across epochs but deliberately bumps it back up at
        epoch 5; the model should log a warning about the effective alpha
        being higher than in previous training cycles.
        """
        raw_sentences = [['human'],
                         ['graph', 'trees']]
        # The gensim class is TaggedDocument (was misspelled "TaggedDocuement").
        sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(raw_sentences)]
        model = doc2vec.Doc2Vec(alpha=0.025, min_alpha=0.025, min_count=1, workers=8, size=5)
        model.build_vocab(sentences)
        for epoch in range(10):
            model.train(sentences)
            model.alpha -= 0.002
            model.min_alpha = model.alpha
            if epoch == 5:
                # Intentional mistake: raising alpha mid-training should trigger the warning.
                model.alpha += 0.05
        warning = "Effective 'alpha' higher than previous training cycles"
        self.assertTrue(warning in str(l))

Example 4

Project: VDiscover
License: View license
Source File: Cluster.py
def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples, vectorizer, reducer, param):

  train_programs, train_features, train_classes = read_traces(train_file, nsamples)
  train_size = len(train_programs)
  print "using", train_size,"examples to train."

  if vectorizer == "bow":
 
    train_dict = dict()
    train_dict[ftype] = train_features
    #batch_size = 16
    #window_size = 20

    print "Transforming data and fitting model.."
    model = make_cluster_pipeline_bow(ftype, reducer)
    X_red = model.fit_transform(train_dict)

  elif vectorizer == "doc2vec":

    from gensim.models.doc2vec import TaggedDocuement
    from gensim.models import Doc2Vec

    print "Vectorizing traces.."
    sentences = []
  
    for (prog,trace) in zip(train_programs,train_features):
      sentences.append(TaggedDocuement(trace.split(" "), [prog]))

    model = Doc2Vec(dm=2, min_count=1, window=5, size=100, sample=1e-4, negative=5, workers=8, iter=1)
    model.build_vocab(sentences)

    for epoch in range(20):
      #print model
      model.train(sentences)
      shuffle(sentences)

    train_dict = dict()

    vec_train_features = []
    for prog in train_programs:
      #print prog, model.docvecs[prog]
      vec_train_features.append(model.docvecs[prog])

    train_dict[ftype] = vec_train_features

    print "Transforming data and fitting model.."
    model = make_cluster_pipeline_doc2vec(ftype, reducer)
    X_red = model.fit_transform(train_dict)


  #pl.rcParams.update({'font.size': 10})
  if type(X_red) == list:
    X_red = np.vstack(X_red)
    print X_red.shape 

  if X_red.shape[1] == 2:

    plt.figure()
    colors = 'brgcmykbgrcmykbgrcmykbgrcmyk'
    ncolors = len(colors)

    for prog,[x,y],cl in zip(train_programs, X_red, train_classes):
      x = gauss(0,0.1) + x
      y = gauss(0,0.1) + y
      try:
          plt.scatter(x, y, c=colors[int(cl)])
          plt.text(x, y+0.02, prog.split("/")[-1])
      except ValueError:
          plt.text(x, y+0.02, cl)
     
   

    if valid_file is not None:
      valid_programs, valid_features, valid_classes = read_traces(valid_file, None)
      valid_dict = dict()
      valid_dict[ftype] = valid_features

      X_red = model.transform(valid_dict)
      for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes):
        x = gauss(0,0.1) + x
        y = gauss(0,0.1) + y
        plt.scatter(x, y, c=colors[cl+1])
        plt.text(x, y+0.02, prog.split("/")[-1])

    #plt.show()
    plt.savefig(train_file.replace(".gz","")+".png")


  from sklearn.cluster import MeanShift, estimate_bandwidth

  bandwidth = estimate_bandwidth(X_red, quantile=0.2)
  print "Clustering with bandwidth:", bandwidth

  af = MeanShift(bandwidth=bandwidth*param).fit(X_red)

  cluster_centers = af.cluster_centers_
  labels = af.labels_
  n_clusters_ = len(cluster_centers)

  if X_red.shape[1] == 2:

    plt.close('all')
    plt.figure(1)
    plt.clf()

    for ([x,y],label, cluster_label) in zip(X_red,train_programs, labels):
      x = gauss(0,0.1) + x
      y = gauss(0,0.1) + y
      plt.scatter(x, y, c = colors[cluster_label % ncolors])

    for i,[x,y] in enumerate(cluster_centers):
      plt.plot(x, y, 'o', markerfacecolor=colors[i % ncolors],
               markeredgecolor='k', markersize=7)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.savefig(train_file.replace(".gz","")+".clusters.png")

  #plt.show()

  clustered_traces = zip(train_programs, labels)
  writer = write_csv(train_file.replace(".gz","")+".clusters")
  for label, cluster in clustered_traces:
     writer.writerow([label.split("/")[-1], cluster])

Example 5

Project: gensim
License: View license
Source File: test_doc2vec.py
    def __iter__(self):
        """Stream the Lee corpus, yielding one TaggedDocument per line.

        Each line is tokenized with utils.simple_preprocess and tagged via
        the instance's _tag() mapping of the line index.
        """
        with open(datapath('lee_background.cor')) as f:
            for i, line in enumerate(f):
                # The gensim class is TaggedDocument (was misspelled "TaggedDocuement").
                yield doc2vec.TaggedDocument(utils.simple_preprocess(line), [self._tag(i)])