Skip to content

Commit aaee2e8

Browse files
No public description
PiperOrigin-RevId: 873193850
1 parent 9d9a3f3 commit aaee2e8

File tree

7 files changed: 109 additions (+109) and 106 deletions (-106).

7 files changed: 109 additions (+109) and 106 deletions (-106).

tensorflow_text/core/kernels/spanning_tree_iterator.cc

Lines changed: 12 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -27,14 +27,15 @@ bool SpanningTreeIterator::HasCycle(const SourceList &sources) {
2727
visiting_.assign(sources.size(), false);
2828

2929
// Search upwards from each node to find cycles.
30-
for (uint32 initial_node = 0; initial_node < sources.size(); ++initial_node) {
30+
for (uint32_t initial_node = 0; initial_node < sources.size();
31+
++initial_node) {
3132
// Search upwards to try to find a cycle.
32-
uint32 current_node = initial_node;
33+
uint32_t current_node = initial_node;
3334
while (true) {
3435
if (searched_[current_node]) break; // already searched
3536
if (visiting_[current_node]) return true; // revisiting implies cycle
3637
visiting_[current_node] = true; // mark as being currently visited
37-
const uint32 source_node = sources[current_node];
38+
const uint32_t source_node = sources[current_node];
3839
if (source_node == current_node) break; // self-loops are roots
3940
current_node = source_node; // advance upwards
4041
}
@@ -45,7 +46,7 @@ bool SpanningTreeIterator::HasCycle(const SourceList &sources) {
4546
if (searched_[current_node]) break; // already searched
4647
searched_[current_node] = true;
4748
visiting_[current_node] = false;
48-
const uint32 source_node = sources[current_node];
49+
const uint32_t source_node = sources[current_node];
4950
if (source_node == current_node) break; // self-loops are roots
5051
current_node = source_node; // advance upwards
5152
}
@@ -54,18 +55,18 @@ bool SpanningTreeIterator::HasCycle(const SourceList &sources) {
5455
return false;
5556
}
5657

57-
uint32 SpanningTreeIterator::NumRoots(const SourceList &sources) {
58-
uint32 num_roots = 0;
59-
for (uint32 node = 0; node < sources.size(); ++node) {
58+
uint32_t SpanningTreeIterator::NumRoots(const SourceList& sources) {
59+
uint32_t num_roots = 0;
60+
for (uint32_t node = 0; node < sources.size(); ++node) {
6061
num_roots += (node == sources[node]);
6162
}
6263
return num_roots;
6364
}
6465

6566
bool SpanningTreeIterator::NextSourceList(SourceList *sources) {
66-
const uint32 num_nodes = sources->size();
67-
for (uint32 i = 0; i < num_nodes; ++i) {
68-
const uint32 new_source = ++(*sources)[i];
67+
const uint32_t num_nodes = sources->size();
68+
for (uint32_t i = 0; i < num_nodes; ++i) {
69+
const uint32_t new_source = ++(*sources)[i];
6970
if (new_source < num_nodes) return true; // absorbed in this digit
7071
(*sources)[i] = 0; // overflowed this digit, carry to next digit
7172
}
@@ -76,7 +77,7 @@ bool SpanningTreeIterator::NextTree(SourceList *sources) {
7677
// Iterate source lists, skipping non-trees.
7778
while (NextSourceList(sources)) {
7879
// Check the number of roots.
79-
const uint32 num_roots = NumRoots(*sources);
80+
const uint32_t num_roots = NumRoots(*sources);
8081
if (forest_) {
8182
if (num_roots == 0) continue;
8283
} else {

tensorflow_text/core/kernels/spanning_tree_iterator.h

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ class SpanningTreeIterator {
3131
public:
3232
// An array that provides the source of the inbound arc for each node. Roots
3333
// are represented as self-loops.
34-
using SourceList = std::vector<uint32>;
34+
using SourceList = std::vector<uint32_t>;
3535

3636
// Creates a spanning tree iterator. If |forest| is true, then this iterates
3737
// over forests instead of trees (i.e., multiple roots are allowed).
@@ -41,7 +41,7 @@ class SpanningTreeIterator {
4141
// true) of a complete digraph containing |num_nodes| nodes. Each tree is
4242
// passed to the |functor| as a SourceList.
4343
template <class Functor>
44-
void ForEachTree(uint32 num_nodes, Functor functor) {
44+
void ForEachTree(uint32_t num_nodes, Functor functor) {
4545
// Conveniently, the all-zero vector represents a valid tree.
4646
SourceList sources(num_nodes, 0);
4747
do {
@@ -54,7 +54,7 @@ class SpanningTreeIterator {
5454
bool HasCycle(const SourceList &sources);
5555

5656
// Returns the number of roots in the |sources|.
57-
static uint32 NumRoots(const SourceList &sources);
57+
static uint32_t NumRoots(const SourceList& sources);
5858

5959
// Advances |sources| to the next source list, or returns false if there are
6060
// no more source lists.

tensorflow_text/core/kernels/split_merge_tokenize_kernel.cc

Lines changed: 21 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -65,31 +65,31 @@ bool IsBreakChar(absl::string_view text) {
6565
return u_isUWhiteSpace(c);
6666
}
6767

68-
Status TokenizeByLabel(const absl::string_view& text,
69-
const Tensor& labels_tensor,
70-
bool force_split_at_break_character,
71-
std::vector<std::string>* tokens,
72-
std::vector<int>* begin_offset,
73-
std::vector<int>* end_offset, int* num_tokens) {
68+
absl::Status TokenizeByLabel(const absl::string_view& text,
69+
const Tensor& labels_tensor,
70+
bool force_split_at_break_character,
71+
std::vector<std::string>* tokens,
72+
std::vector<int>* begin_offset,
73+
std::vector<int>* end_offset, int* num_tokens) {
7474
std::vector<absl::string_view> chars;
7575
if (!GetUTF8Chars(text, &chars)) {
76-
return Status(static_cast<::absl::StatusCode>(
77-
absl::StatusCode::kInvalidArgument),
78-
absl::StrCat("Input string is not utf8 valid: ", text));
76+
return absl::Status(
77+
static_cast<::absl::StatusCode>(absl::StatusCode::kInvalidArgument),
78+
absl::StrCat("Input string is not utf8 valid: ", text));
7979
}
8080

8181
if (chars.size() > labels_tensor.dim_size(0)) {
82-
return Status(static_cast<::absl::StatusCode>(
83-
absl::StatusCode::kInvalidArgument),
84-
absl::StrCat("Number of labels ", labels_tensor.dim_size(0),
85-
" is insufficient for text ", text));
82+
return absl::Status(
83+
static_cast<::absl::StatusCode>(absl::StatusCode::kInvalidArgument),
84+
absl::StrCat("Number of labels ", labels_tensor.dim_size(0),
85+
" is insufficient for text ", text));
8686
}
8787

8888
const int split_label = 0;
8989
bool last_character_is_break_character = false;
9090
int start = 0;
9191
bool has_new_token_generated_for_text = false;
92-
const auto& labels = labels_tensor.unaligned_flat<int32>();
92+
const auto& labels = labels_tensor.unaligned_flat<int32_t>();
9393
for (int i = 0; i < chars.size(); ++i) {
9494
const bool is_break_character = IsBreakChar(chars[i]);
9595
if (!is_break_character) {
@@ -138,14 +138,14 @@ class SplitMergeTokenizeWithOffsetsOp : public OpKernel {
138138
" elements, got ",
139139
row_splits->dim_size(0)));
140140

141-
std::vector<string> tokens;
141+
std::vector<std::string> tokens;
142142
std::vector<int> begin_offset;
143143
std::vector<int> end_offset;
144144
std::vector<int> output_row_splits(1, 0);
145145

146146
// Iterate through all the values and tokenize them.
147147
const auto& values_vec = input_values->flat<tstring>();
148-
const auto& row_splits_vec = row_splits->flat<int32>();
148+
const auto& row_splits_vec = row_splits->flat<int32_t>();
149149
for (int i = 0; i < values_vec.size(); ++i) {
150150
// Tokenize into tokens and record the offset locations.
151151
int num_tokens = 0;
@@ -160,10 +160,10 @@ class SplitMergeTokenizeWithOffsetsOp : public OpKernel {
160160
output_row_splits.push_back(num_tokens + output_row_splits.back());
161161
}
162162

163-
std::vector<int64> output_tokens_shape;
163+
std::vector<int64_t> output_tokens_shape;
164164
output_tokens_shape.push_back(tokens.size());
165165

166-
std::vector<int64> output_row_splits_shape;
166+
std::vector<int64_t> output_row_splits_shape;
167167
output_row_splits_shape.push_back(output_row_splits.size());
168168

169169
Tensor* output_values;
@@ -177,19 +177,19 @@ class SplitMergeTokenizeWithOffsetsOp : public OpKernel {
177177
ctx->allocate_output("output_row_splits",
178178
TensorShape(output_row_splits_shape),
179179
&output_row_splits_tensor));
180-
auto output_row_splits_vec = output_row_splits_tensor->vec<int64>();
180+
auto output_row_splits_vec = output_row_splits_tensor->vec<int64_t>();
181181

182182
Tensor* start_values;
183183
OP_REQUIRES_OK(ctx, ctx->allocate_output("start_values",
184184
TensorShape(output_tokens_shape),
185185
&start_values));
186-
auto start_values_vec = start_values->vec<int64>();
186+
auto start_values_vec = start_values->vec<int64_t>();
187187

188188
Tensor* limit_values;
189189
OP_REQUIRES_OK(ctx, ctx->allocate_output("limit_values",
190190
TensorShape(output_tokens_shape),
191191
&limit_values));
192-
auto limit_values_vec = limit_values->vec<int64>();
192+
auto limit_values_vec = limit_values->vec<int64_t>();
193193

194194
for (int i = 0; i < tokens.size(); ++i) {
195195
output_values_vec(i) = tokens[i];

tensorflow_text/core/kernels/tokenizer_from_logits_kernel.cc

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -68,22 +68,22 @@ bool IsBreakChar(absl::string_view text) {
6868
// allows us to retrieve the corresponding data from logits. I.e., the logits
6969
// for the i-th character from text are logits(batch_index, i, 0) (for the
7070
// "split" action) and logits(batch_index, i, 1) (for the "merge" action).
71-
Status TokenizeByLogits(const absl::string_view& text,
72-
const TTypes<const float, 3>::Tensor& logits,
73-
int batch_index,
74-
bool force_split_at_break_character,
75-
std::vector<std::string>* tokens,
76-
std::vector<int>* begin_offset,
77-
std::vector<int>* end_offset, int* num_tokens) {
71+
absl::Status TokenizeByLogits(const absl::string_view& text,
72+
const TTypes<const float, 3>::Tensor& logits,
73+
int batch_index,
74+
bool force_split_at_break_character,
75+
std::vector<std::string>* tokens,
76+
std::vector<int>* begin_offset,
77+
std::vector<int>* end_offset, int* num_tokens) {
7878
std::vector<absl::string_view> chars;
7979
if (!GetUTF8Chars(text, &chars)) {
80-
return Status(
80+
return absl::Status(
8181
static_cast<absl::StatusCode>(absl::StatusCode::kInvalidArgument),
8282
absl::StrCat("Input string is not utf8 valid: ", text));
8383
}
8484

8585
if (chars.size() > logits.dimension(1)) {
86-
return Status(
86+
return absl::Status(
8787
static_cast<absl::StatusCode>(absl::StatusCode::kInvalidArgument),
8888
absl::StrCat("Number of logits, ", logits.dimension(1),
8989
", is insufficient for text \"", text, "\""));
@@ -142,7 +142,7 @@ class TokenizerFromLogitsOp : public OpKernel {
142142
const bool force_split_at_break_character_bool =
143143
force_split_at_break_character->scalar<bool>()();
144144

145-
std::vector<string> tokens;
145+
std::vector<std::string> tokens;
146146
std::vector<int> begin_offset;
147147
std::vector<int> end_offset;
148148
std::vector<int> output_row_splits(1, 0);
@@ -175,10 +175,10 @@ class TokenizerFromLogitsOp : public OpKernel {
175175
output_row_splits.push_back(num_tokens + output_row_splits.back());
176176
}
177177

178-
std::vector<int64> output_tokens_shape;
178+
std::vector<int64_t> output_tokens_shape;
179179
output_tokens_shape.push_back(tokens.size());
180180

181-
std::vector<int64> output_row_splits_shape;
181+
std::vector<int64_t> output_row_splits_shape;
182182
output_row_splits_shape.push_back(output_row_splits.size());
183183

184184
Tensor* output_values;
@@ -192,19 +192,19 @@ class TokenizerFromLogitsOp : public OpKernel {
192192
ctx->allocate_output("row_splits",
193193
TensorShape(output_row_splits_shape),
194194
&output_row_splits_tensor));
195-
auto output_row_splits_vec = output_row_splits_tensor->vec<int64>();
195+
auto output_row_splits_vec = output_row_splits_tensor->vec<int64_t>();
196196

197197
Tensor* start_values;
198198
OP_REQUIRES_OK(ctx, ctx->allocate_output("start_values",
199199
TensorShape(output_tokens_shape),
200200
&start_values));
201-
auto start_values_vec = start_values->vec<int64>();
201+
auto start_values_vec = start_values->vec<int64_t>();
202202

203203
Tensor* limit_values;
204204
OP_REQUIRES_OK(ctx, ctx->allocate_output("limit_values",
205205
TensorShape(output_tokens_shape),
206206
&limit_values));
207-
auto limit_values_vec = limit_values->vec<int64>();
207+
auto limit_values_vec = limit_values->vec<int64_t>();
208208

209209
for (int i = 0; i < tokens.size(); ++i) {
210210
output_values_vec(i) = tokens[i];

tensorflow_text/core/kernels/unicode_script_tokenize_kernel.cc

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -58,7 +58,7 @@ class UnicodeScriptTokenizeWithOffsetsOp : public OpKernel {
5858
void Compute(OpKernelContext* context) override {
5959
// Get inputs
6060
const Tensor& input_values_tensor = context->input(0);
61-
const auto input_values_flat = input_values_tensor.flat<int32>();
61+
const auto input_values_flat = input_values_tensor.flat<int32_t>();
6262
const Tensor& input_splits_tensor = context->input(1);
6363
const auto input_splits_flat = input_splits_tensor.flat<SPLITS_TYPE>();
6464

@@ -80,10 +80,10 @@ class UnicodeScriptTokenizeWithOffsetsOp : public OpKernel {
8080
auto output_outer_splits_flat =
8181
output_outer_splits_tensor->flat<SPLITS_TYPE>();
8282

83-
std::vector<int32> output_values;
83+
std::vector<int32_t> output_values;
8484
std::vector<SPLITS_TYPE> output_values_inner_splits;
85-
std::vector<int64> output_offset_starts;
86-
std::vector<int64> output_offset_limits;
85+
std::vector<int64_t> output_offset_starts;
86+
std::vector<int64_t> output_offset_limits;
8787

8888
// Loop over the codepoints (a split at a time) and create splits of tokens.
8989
icu::ErrorCode status;
@@ -92,12 +92,13 @@ class UnicodeScriptTokenizeWithOffsetsOp : public OpKernel {
9292
output_outer_splits_flat(splits_idx) = output_offset_starts.size();
9393
UScriptCode prev_script = USCRIPT_INVALID_CODE;
9494
bool token_has_start_set = false;
95-
int32 curr_skipped_spaces = 0; // Used when computing the end of a token
95+
int32_t curr_skipped_spaces =
96+
0; // Used when computing the end of a token
9697
const int curr_word_start_idx = input_splits_flat(splits_idx);
9798
bool was_space = false;
9899
for (int values_idx = curr_word_start_idx;
99100
values_idx < input_splits_flat(splits_idx + 1); values_idx++) {
100-
const int32 input_value = input_values_flat(values_idx);
101+
const int32_t input_value = input_values_flat(values_idx);
101102
const bool is_space = u_isUWhiteSpace(input_value);
102103
UScriptCode script = uscript_getScript(input_value, status);
103104
// Split these failures out as if they are a different code and ignore
@@ -166,11 +167,11 @@ class UnicodeScriptTokenizeWithOffsetsOp : public OpKernel {
166167
do { \
167168
} while (false)
168169

169-
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values, int32);
170+
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values, int32_t);
170171
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values_inner_splits,
171172
SPLITS_TYPE);
172-
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_starts, int64);
173-
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_limits, int64);
173+
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_starts, int64_t);
174+
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_limits, int64_t);
174175

175176
#undef DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR
176177
}
@@ -183,12 +184,12 @@ class UnicodeScriptTokenizeWithOffsetsOp : public OpKernel {
183184

184185
REGISTER_KERNEL_BUILDER(Name("UnicodeScriptTokenizeWithOffsets")
185186
.Device(DEVICE_CPU)
186-
.TypeConstraint<int32>("Tsplits"),
187-
UnicodeScriptTokenizeWithOffsetsOp<int32>);
187+
.TypeConstraint<int32_t>("Tsplits"),
188+
UnicodeScriptTokenizeWithOffsetsOp<int32_t>);
188189
REGISTER_KERNEL_BUILDER(Name("UnicodeScriptTokenizeWithOffsets")
189190
.Device(DEVICE_CPU)
190-
.TypeConstraint<int64>("Tsplits"),
191-
UnicodeScriptTokenizeWithOffsetsOp<int64>);
191+
.TypeConstraint<int64_t>("Tsplits"),
192+
UnicodeScriptTokenizeWithOffsetsOp<int64_t>);
192193

193194
} // namespace text
194195
} // namespace tensorflow

tensorflow_text/core/kernels/whitespace_tokenize_kernel.cc

Lines changed: 13 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@ class WhitespaceTokenizeWithOffsetsOp : public OpKernel {
5454
void Compute(OpKernelContext* context) override {
5555
// Get inputs
5656
const Tensor& input_values_tensor = context->input(0);
57-
const auto input_values_flat = input_values_tensor.flat<int32>();
57+
const auto input_values_flat = input_values_tensor.flat<int32_t>();
5858
const Tensor& input_splits_tensor = context->input(1);
5959
const auto input_splits_flat = input_splits_tensor.flat<SPLITS_TYPE>();
6060

@@ -76,17 +76,18 @@ class WhitespaceTokenizeWithOffsetsOp : public OpKernel {
7676
auto output_outer_splits_flat =
7777
output_outer_splits_tensor->flat<SPLITS_TYPE>();
7878

79-
std::vector<int32> output_values;
79+
std::vector<int32_t> output_values;
8080
std::vector<SPLITS_TYPE> output_values_inner_splits;
81-
std::vector<int64> output_offset_starts;
82-
std::vector<int64> output_offset_limits;
81+
std::vector<int64_t> output_offset_starts;
82+
std::vector<int64_t> output_offset_limits;
8383

8484
// Loop over the codepoints (a split at a time) and create splits of tokens.
8585
for (int splits_idx = 0; splits_idx < input_splits_flat.size() - 1;
8686
splits_idx++) {
8787
output_outer_splits_flat(splits_idx) = output_offset_starts.size();
8888
bool token_has_start_set = false;
89-
int32 curr_skipped_spaces = 0; // Used when computing the end of a token
89+
int32_t curr_skipped_spaces =
90+
0; // Used when computing the end of a token
9091
const int curr_word_start_idx = input_splits_flat(splits_idx);
9192
for (int values_idx = curr_word_start_idx;
9293
values_idx < input_splits_flat(splits_idx + 1); values_idx++) {
@@ -135,11 +136,11 @@ class WhitespaceTokenizeWithOffsetsOp : public OpKernel {
135136
auto name##_data = name##_tensor->flat<dtype>().data(); \
136137
memcpy(name##_data, name.data(), name##_size * sizeof(dtype));
137138

138-
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values, int32);
139+
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values, int32_t);
139140
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values_inner_splits,
140141
SPLITS_TYPE);
141-
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_starts, int64);
142-
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_limits, int64);
142+
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_starts, int64_t);
143+
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_limits, int64_t);
143144

144145
#undef DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR
145146
}
@@ -150,12 +151,12 @@ class WhitespaceTokenizeWithOffsetsOp : public OpKernel {
150151

151152
REGISTER_KERNEL_BUILDER(Name("WhitespaceTokenizeWithOffsets")
152153
.Device(DEVICE_CPU)
153-
.TypeConstraint<int32>("Tsplits"),
154-
WhitespaceTokenizeWithOffsetsOp<int32>);
154+
.TypeConstraint<int32_t>("Tsplits"),
155+
WhitespaceTokenizeWithOffsetsOp<int32_t>);
155156
REGISTER_KERNEL_BUILDER(Name("WhitespaceTokenizeWithOffsets")
156157
.Device(DEVICE_CPU)
157-
.TypeConstraint<int64>("Tsplits"),
158-
WhitespaceTokenizeWithOffsetsOp<int64>);
158+
.TypeConstraint<int64_t>("Tsplits"),
159+
WhitespaceTokenizeWithOffsetsOp<int64_t>);
159160

160161
} // namespace text
161162
} // namespace tensorflow

0 commit comments

Comments
 (0)