This repository was archived by the owner on Jul 22, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathwhatlang
More file actions
executable file
·79 lines (68 loc) · 2.89 KB
/
whatlang
File metadata and controls
executable file
·79 lines (68 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python3
# =============================================================================
# @file whatlang
# @brief Utility to report the most likely human language used in the input
# @author Michael Hucka <mhucka@caltech.edu>
# @license Please see the file named LICENSE in the project directory
# @website https://github.com/casics/codeornot
# =============================================================================
import os
import plac
import sys
# Allow this program to be executed directly from the 'bin' directory.
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
import codeornot
from codeornot import textcheck
@plac.annotations(
debug = ('drop into ipdb after parsing', 'flag', 'd'),
version = ('print version info and exit', 'flag', 'v'),
input = 'file or string to test'
)
def main(debug=False, version=False, input=None):
'''`whatlang` is a simple utility to report the human language used in
the text of a file or a string. If the argument given can be interpreted as a
file, it reads and processes the file; if the argument is a string on the
command line, it processes the string instead. It relies on the ftfy package
to clean up the text and CLD2 (a probabilistic language detector) to analyze
the language of the text.'''
# Process arguments
if version:
print('{} version {}'.format(codeornot.__version__.__title__,
codeornot.__version__.__version__))
print('Author: {}'.format(codeornot.__version__.__author__))
print('URL: {}'.format(codeornot.__version__.__url__))
print('License: {}'.format(codeornot.__version__.__license__))
sys.exit()
if len(input) < 1:
raise SystemExit('Need a file or string as input argument')
if os.path.exists(input):
input_type = 'file'
with open(input) as f:
text = f.read()
elif os.path.exists(os.path.join(os.getcwd(), input)):
input_type = 'file'
with open(os.path.join(os.getcwd(), input)) as f:
text = f.read()
else:
input_type = 'string'
text = input
# Test input length and return a meaningful message if it's too short.
if len(text) < textcheck.min_length():
print('Text needs to be at least {} characters long for analysis'
.format(textcheck.min_length()))
sys.exit(1)
# Analyze the text and return the results.
lang = textcheck.majority_language(text, enforce_length=True)
if lang and len(lang) > 0:
print('Best guess at language used in {}: {} ({})'
.format(input_type, lang, textcheck.iso639_to_name(lang)))
if debug:
import ipdb; ipdb.set_trace()
if __name__ == '__main__':
plac.call(main)
# For Emacs users
# ......................................................................
# Local Variables:
# mode: python
# python-indent-offset: 4
# End: