こういうテキストがあって
"take your time and take it easy."
ユニークにして重複はカウントアップして
以下のような3パターンの結果を得たい
- get_count_text1
- [[and, easy, it, take, time, your], [1.0, 1.0, 1.0, 2.0, 1.0, 1.0]]
- get_count_text2
- [[and, 1.0], [easy, 1.0], [it, 1.0], [take, 2.0], [time, 1.0], [your, 1.0]]
- get_count_text3
- [{count=1.0, text=and}, {count=1.0, text=easy}, {count=1.0, text=it}, {count=2.0, text=take}, {count=1.0, text=time}, {count=1.0, text=your}]
このコードで分割する単位
漢字、ひらがな、カタカナ、半角カタカナ、半角英数、全角英数
それを実現する正規表現
/[々〆〇〻㐂-頻]+|[ぁ-ゟー]+|[゠-ヿ]+|[\uFF66-\uFF9F]+|[a-zA-Z0-9]+|[\uFF41-\uFF5A\uFF21-\uFF3A\uFF10-\uFF19]+/g
コード.gs
/**
 * Demo entry point: splits a sample sentence into tokens and logs how many
 * times each token occurs (using get_count_text3).
 *
 * The pattern matches runs of, in order: kanji, hiragana, katakana,
 * half-width katakana, half-width alphanumerics and full-width alphanumerics.
 */
function split_text() {
  var str = "take your time and take it easy.";
  // Half-width katakana (U+FF66-U+FF9F) and full-width alphanumerics are
  // written as \u escapes so the ranges stay valid and distinct from the
  // ASCII class even if the file's text is Unicode-normalized.
  var pattern = /[々〆〇〻㐂-頻]+|[ぁ-ゟー]+|[゠-ヿ]+|[\uFF66-\uFF9F]+|[a-zA-Z0-9]+|[\uFF41-\uFF5A\uFF21-\uFF3A\uFF10-\uFF19]+/g;
  // match() returns null when nothing matches; fall back to an empty list
  // so the counting helper always receives an array.
  var result = str.match(pattern) || [];
  var count_text = get_count_text3(result);
  Logger.log(count_text);
}
/**
 * Counts how often each distinct value occurs and returns the result as two
 * parallel arrays.
 *
 * The input is not modified: a copy is sorted with the default Array sort
 * (string order) and equal neighbours are merged using strict equality.
 *
 * @param {Array} array - Values to count (presumably strings, e.g. the result
 *     of String.prototype.match).
 * @returns {Array} A two-element array `[words, counts]` where `counts[i]` is
 *     the number of occurrences of `words[i]`; `[[], []]` for empty input.
 */
function get_count_text1(array) {
  // Sort a copy so the caller's array keeps its original order.
  var sorted = array.slice().sort();
  var word = [];
  var count = [];
  for (var i = 0; i < sorted.length; i++) {
    if (i > 0 && sorted[i] === sorted[i - 1]) {
      // Same value as the previous element: bump the most recent counter.
      count[count.length - 1] += 1;
    } else {
      // First element, or a value not seen yet: start a new entry.
      word.push(sorted[i]);
      count.push(1);
    }
  }
  return [word, count];
}
/**
 * Counts how often each distinct value occurs and returns the result as a
 * list of `[value, count]` pairs.
 *
 * The input is not modified: a copy is sorted with the default Array sort
 * (string order) and equal neighbours are merged using strict equality.
 *
 * @param {Array} array - Values to count (presumably strings, e.g. the result
 *     of String.prototype.match).
 * @returns {Array} Pairs `[value, count]` in sorted order; `[]` for empty
 *     input.
 */
function get_count_text2(array) {
  // Sort a copy so the caller's array keeps its original order.
  var sorted = array.slice().sort();
  var result = [];
  for (var i = 0; i < sorted.length; i++) {
    if (i > 0 && sorted[i] === sorted[i - 1]) {
      // Same value as the previous element: bump the most recent pair.
      result[result.length - 1][1] += 1;
    } else {
      // First element, or a value not seen yet: start a new pair.
      result.push([sorted[i], 1]);
    }
  }
  return result;
}
/**
 * Counts how often each distinct value occurs and returns the result as a
 * list of `{text, count}` objects.
 *
 * The input is not modified: a copy is sorted with the default Array sort
 * (string order) and equal neighbours are merged using strict equality.
 *
 * @param {Array} array - Values to count (presumably strings, e.g. the result
 *     of String.prototype.match).
 * @returns {Array} Objects with a `text` property (the value) and a `count`
 *     property (its number of occurrences), in sorted order; `[]` for empty
 *     input.
 */
function get_count_text3(array) {
  // Sort a copy so the caller's array keeps its original order.
  var sorted = array.slice().sort();
  var arrays = [];
  for (var i = 0; i < sorted.length; i++) {
    var value = sorted[i];
    // The i > 0 guard keeps the first element from being compared against
    // the non-existent index -1 (which reads as undefined).
    if (i > 0 && value === sorted[i - 1]) {
      arrays[arrays.length - 1]["count"] += 1;
    } else {
      var obj = {};
      obj["text"] = value;
      obj["count"] = 1;
      arrays.push(obj);
    }
  }
  return arrays;
}
|