fastText  d00d36476b15
Fast text processing tool/library
dictionary.h
Go to the documentation of this file.
1 
10 #ifndef FASTTEXT_DICTIONARY_H
11 #define FASTTEXT_DICTIONARY_H
12 
13 #include <vector>
14 #include <string>
15 #include <istream>
16 #include <ostream>
17 #include <random>
18 #include <memory>
19 #include <unordered_map>
20 
21 #include "args.h"
22 #include "real.h"
23 
24 namespace fasttext {
25 
// Integer alias for identifiers. NOTE(review): the declarations visible in this
// header spell out int32_t directly instead of using id_type.
26 typedef int32_t id_type;
// Kind of a dictionary entry, stored in a signed 8-bit value: word (0) or label (1).
27 enum class entry_type : int8_t {word=0, label=1};
28 
// One dictionary record. Meanings beyond the declared types are assumptions to
// confirm against the implementation file.
29 struct entry {
30  std::string word;  // string form of the entry
31  int64_t count;  // presumably the occurrence count -- TODO confirm
32  entry_type type;  // whether the entry is a word or a label (see entry_type)
33  std::vector<int32_t> subwords;  // integer ids; presumably subword/n-gram indices -- TODO confirm
34 };
35 
// Dictionary: declares the storage for vocabulary entries together with the
// operations that build, query, serialize and prune them. Only declarations are
// visible in this header; remarks about behaviour are assumptions that should
// be confirmed against the implementation file.
36 class Dictionary {
37  private:
    // Compile-time limits. Presumably the lookup-table capacity and the maximum
    // number of tokens taken from one line -- TODO confirm.
38  static const int32_t MAX_VOCAB_SIZE = 30000000;
39  static const int32_t MAX_LINE_SIZE = 1024;
40 
    // Private string lookup returning an int32_t; presumably a slot index into
    // word2int_ -- verify against the implementation.
41  int32_t find(const std::string&) const;
42  void initTableDiscard();
43  void initNgrams();
44 
    // Shared (not exclusively owned) handle to the Args object passed to the
    // constructor's parameter type.
45  std::shared_ptr<Args> args_;
    // Two parallel stores: an int32_t table and the vector of entry records.
    // Presumably word2int_ maps hash slots to indices in words_ -- TODO confirm.
46  std::vector<int32_t> word2int_;
47  std::vector<entry> words_;
48 
    // One `real` per element; presumably per-word discard values -- TODO confirm.
49  std::vector<real> pdiscard_;
    // NOTE: the four counters below have no in-class initializer, so they hold
    // indeterminate values unless the constructor (defined elsewhere) sets them.
50  int32_t size_;
51  int32_t nwords_;
52  int32_t nlabels_;
53  int64_t ntokens_;
54 
    // Defaults to -1; presumably a sentinel meaning "no pruning applied" --
    // TODO confirm.
55  int64_t pruneidx_size_ = -1;
56  std::unordered_map<int32_t, int32_t> pruneidx_;
    // Private helper: `line` is a mutable output vector, `hashes` is read-only,
    // `n` is passed by value. Does not modify the Dictionary (const).
57  void addWordNgrams(
58  std::vector<int32_t>& line,
59  const std::vector<int32_t>& hashes,
60  int32_t n) const;
61 
62 
63  public:
    // Constant strings declared here and defined outside this header.
    // Presumably end-of-sentence, begin-of-word and end-of-word markers --
    // TODO confirm.
64  static const std::string EOS;
65  static const std::string BOW;
66  static const std::string EOW;
67 
    // explicit: a std::shared_ptr<Args> is never implicitly converted into a
    // Dictionary.
68  explicit Dictionary(std::shared_ptr<Args>);
    // Read-only accessors (all const).
69  int32_t nwords() const;
70  int32_t nlabels() const;
71  int64_t ntokens() const;
72  int32_t getId(const std::string&) const;
    // getType is overloaded for an int32_t argument and for a string argument.
73  entry_type getType(int32_t) const;
74  entry_type getType(const std::string&) const;
75  bool discard(int32_t, real) const;
76  std::string getWord(int32_t) const;
    // getSubwords overloads: the int32_t form returns a const reference, the
    // string form returns a vector by value.
    // NOTE(review): the `const` on the by-value return prevents callers from
    // moving out of the result.
77  const std::vector<int32_t>& getSubwords(int32_t) const;
78  const std::vector<int32_t> getSubwords(const std::string&) const;
    // computeSubwords / three-argument getSubwords write their results into the
    // caller-supplied vectors (non-const reference parameters).
79  void computeSubwords(const std::string&, std::vector<int32_t>&) const;
80  void computeSubwords(
81  const std::string&,
82  std::vector<int32_t>&,
83  std::vector<std::string>&) const;
84  void getSubwords(
85  const std::string&,
86  std::vector<int32_t>&,
87  std::vector<std::string>&) const;
88  uint32_t hash(const std::string& str) const;
    // add() is non-const: it mutates the dictionary.
89  void add(const std::string&);
    // Reads from the stream into the string argument and reports a bool;
    // presumably false once no further word is available -- TODO confirm.
90  bool readWord(std::istream&, std::string&) const;
91  void readFromFile(std::istream&);
92  std::string getLabel(int32_t) const;
    // Stream serialization pair: save() is const, load() mutates the object.
93  void save(std::ostream&) const;
94  void load(std::istream&);
95  std::vector<int64_t> getCounts(entry_type) const;
    // getLine overloads: both take an input stream and a std::minstd_rand by
    // reference and return an int32_t; one fills three int32_t vectors, the
    // other two. What each vector receives is not visible here.
96  int32_t getLine(std::istream&, std::vector<int32_t>&, std::vector<int32_t>&,
97  std::vector<int32_t>&, std::minstd_rand&) const;
98  int32_t getLine(std::istream&, std::vector<int32_t>&,
99  std::vector<int32_t>&, std::minstd_rand&) const;
    // Non-const maintenance operations; the meaning of the two int64_t
    // arguments and of the vector is defined by the implementation.
100  void threshold(int64_t, int64_t);
101  void prune(std::vector<int32_t>&);
102 };
103 
104 }
105 
106 #endif
std::unordered_map< int32_t, int32_t > pruneidx_
Definition: dictionary.h:56
fasttext::Dictionary
Definition: dictionary.h:36
std::string word
Definition: dictionary.h:30
std::vector< real > pdiscard_
Definition: dictionary.h:49
static const std::string EOW
Definition: dictionary.h:66
int32_t nwords_
Definition: dictionary.h:51
Definition: args.cc:17
static const std::string BOW
Definition: dictionary.h:65
int32_t nlabels_
Definition: dictionary.h:52
entry_type type
Definition: dictionary.h:32
int32_t id_type
Definition: dictionary.h:26
std::shared_ptr< Args > args_
Definition: dictionary.h:45
std::vector< int32_t > word2int_
Definition: dictionary.h:46
std::vector< int32_t > subwords
Definition: dictionary.h:33
fasttext::entry
Definition: dictionary.h:29
entry_type
Definition: dictionary.h:27
int64_t count
Definition: dictionary.h:31
float real
Definition: real.h:15
int64_t ntokens_
Definition: dictionary.h:53
int32_t size_
Definition: dictionary.h:50
static const std::string EOS
Definition: dictionary.h:64
std::vector< entry > words_
Definition: dictionary.h:47