-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathword_histogram.sh
executable file
·86 lines (75 loc) · 2.29 KB
/
word_histogram.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/bin/bash
# Function to remove diacritics from a line of text
#
# Arguments:
#   $1 - the text to transliterate
# Outputs:
#   $1 on stdout, with each accented Latin letter listed below replaced by
#   its unaccented ASCII base letter; every other character is passed through.
remove_diacritics()
{
  # printf instead of echo: echo would swallow a line such as "-n" or "-e".
  #
  # Each accented letter is matched as a literal via s/// alternation rather
  # than with a single y/// command: y/// counts bytes in a C/POSIX locale,
  # sees source and target lists of different lengths and makes sed abort.
  # Literal matches give the same result in any locale.
  printf '%s\n' "$1" | sed -E \
    -e 's/ą|ā|á|ǎ|à/a/g' \
    -e 's/ć/c/g' \
    -e 's/ē|é|ě|è|ę/e/g' \
    -e 's/ī|í|ǐ|ì/i/g' \
    -e 's/ł/l/g' \
    -e 's/ń/n/g' \
    -e 's/ō|ó|ǒ|ò/o/g' \
    -e 's/ś/s/g' \
    -e 's/ū|ú|ǔ|ù|ǖ|ǘ|ǚ|ǜ/u/g' \
    -e 's/ż|ź/z/g' \
    -e 's/Ą|Ā|Á|Ǎ|À/A/g' \
    -e 's/Ć/C/g' \
    -e 's/Ē|Ę|É|Ě|È/E/g' \
    -e 's/Ī|Í|Ǐ|Ì/I/g' \
    -e 's/Ł/L/g' \
    -e 's/Ń/N/g' \
    -e 's/Ō|Ó|Ǒ|Ò/O/g' \
    -e 's/Ś/S/g' \
    -e 's/Ū|Ú|Ǔ|Ù|Ǖ|Ǘ|Ǚ|Ǜ/U/g' \
    -e 's/Ż|Ź/Z/g'
}
# Process text to calculate word frequencies
#
# Arguments:
#   $1 - path of the file to read (may be /dev/stdin)
#   $2 - minimum word length; shorter words are ignored (defaults to 0)
# Outputs:
#   one "word:count" line per distinct lower-cased word, in no particular order
# Returns:
#   1 if the input could not be read, 0 otherwise
process_text() {
  local file=$1
  local min_word_length=${2:-0}
  local line word
  local -a words
  local -A wordcounts=()

  # "|| [[ -n $line ]]" keeps a final line that lacks a trailing newline;
  # read returns non-zero for it but still fills in $line.
  while IFS= read -r line || [[ -n $line ]]; do
    line=$(remove_diacritics "$line")
    # Drop everything that is neither a letter nor whitespace.
    line=$(printf '%s\n' "$line" | tr -dc '[:alpha:][:space:]')
    # tr keeps CR, VT and FF, but the split below only breaks on space, tab
    # and newline. Turn them into spaces so CRLF input does not yield "word\r".
    line=${line//[$'\r\v\f']/ }
    # Split with an explicit IFS so the result does not depend on the caller's.
    IFS=$' \t\n' read -r -a words <<<"$line"
    for word in "${words[@]}"; do
      word=${word,,} # Convert to lowercase
      if [ "${#word}" -ge "$min_word_length" ]; then
        wordcounts[$word]=$(( ${wordcounts[$word]:-0} + 1 ))
      fi
    done
  done < "$file" || return 1

  # Output word frequencies
  for word in "${!wordcounts[@]}"; do
    printf '%s:%s\n' "$word" "${wordcounts[$word]}"
  done
}
# GNU parallel runs process_text in child shells, so both functions must be
# exported to be visible there.
export -f remove_diacritics process_text

# Main program starts here
#
# Usage: word_histogram.sh [-l MIN_LENGTH] [-j] [FILE...]
#   -l MIN_LENGTH  only count words of at least MIN_LENGTH characters (default 0)
#   -j             print a single JSON object instead of "word:count" lines
# With no FILE arguments the text is read from standard input.
# Needs GNU parallel when FILEs are given and jq when -j is used.
min_word_length=0
output_json=false

while getopts ":l:j" opt; do
  case $opt in
    l)
      min_word_length=$OPTARG
      ;;
    j)
      output_json=true
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

# The value is pasted into the command line handed to parallel below, so it
# must be a plain number: anything else would be run as shell code.
if [[ ! $min_word_length =~ ^[0-9]+$ ]]; then
  echo "Option -l requires a non-negative integer, got: $min_word_length" >&2
  exit 1
fi

# Sum the counts of identical words read as "word:count" lines on stdin.
# Every input file is counted separately, so the same word can appear once
# per file; without this step text output repeats words and JSON output keeps
# only the last file's count.
merge_counts() {
  awk -F: '{ total[$1] += $2 } END { for (w in total) print w ":" total[w] }'
}

# Render merged "word:count" lines from stdin in the requested format.
emit_histogram() {
  if $output_json; then
    # "// {}" prints an empty object instead of null when there are no words.
    jq -Rn '[inputs | split(":") | {(.[0]): (.[1] | tonumber)}] | add // {}'
  else
    # Most frequent words first.
    sort -t: -k2,2nr
  fi
}

# Define the temporary file; the trap removes it on every exit path.
temp_file=$(mktemp) || exit 1
trap 'rm -f -- "$temp_file"' EXIT

status=0
# Check if files are provided
if [ "$#" -eq 0 ]; then
  # No files provided, reading from stdin
  process_text "/dev/stdin" "$min_word_length" > "$temp_file" || status=$?
else
  # Process files in parallel
  parallel --will-cite "process_text {} $min_word_length" ::: "$@" \
    > "$temp_file" || status=$?
fi

merge_counts < "$temp_file" | emit_histogram || status=$?

# Report a failed counting or formatting step to the caller; whatever was
# counted has already been printed.
exit "$status"