-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.go
More file actions
99 lines (87 loc) · 2.06 KB
/
utils.go
File metadata and controls
99 lines (87 loc) · 2.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
package main
import (
"crypto/sha256"
"encoding/binary"
"io"
"strings"
"unicode"
"unicode/utf8"
)
// SplitWords has the bufio.SplitFunc signature and splits data into word and
// punctuation tokens.
//
// Leading whitespace is skipped. A punctuation rune (see isPunctuation) that
// begins a token is returned on its own. Otherwise the token runs up to the
// next whitespace rune, or to the end of data when atEOF is true, minus any
// run of punctuation that ends it; that punctuation is left unconsumed so
// that subsequent calls return it one rune at a time. Punctuation enclosed by
// word runes (as in "don't") stays inside the token.
//
// When no complete token is available yet and atEOF is false, the token is
// nil and advance covers only the skipped whitespace, which asks the caller
// for more data. The returned err is always nil.
func SplitWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading spaces.
	start := 0
	for start < len(data) {
		r, width := utf8.DecodeRune(data[start:])
		if !unicode.IsSpace(r) {
			break
		}
		start += width
	}

	// Scan until whitespace, tracking the byte width of the punctuation run
	// that currently ends the word.
	curr := start
	punctWidth := 0
	for curr < len(data) {
		if !atEOF && !utf8.FullRune(data[curr:]) {
			// The buffer stops inside a multi-byte rune. Decoding it now
			// would yield a one-byte RuneError that could be emitted as a
			// bogus punctuation token, so wait for the remaining bytes.
			return start, nil, nil
		}
		r, width := utf8.DecodeRune(data[curr:])
		switch {
		case unicode.IsSpace(r):
			// data[start] is never whitespace, so the word is non-empty.
			// Return it without its trailing punctuation.
			end := curr - punctWidth
			return end, data[start:end], nil
		case isPunctuation(r):
			if curr == start {
				// Punctuation at the start of a token is returned alone.
				return curr + width, data[start : curr+width], nil
			}
			// Possibly trailing punctuation; a later word rune resets this.
			punctWidth += width
		default:
			punctWidth = 0
		}
		curr += width
	}

	// At EOF there is a final, non-terminated word. Strip trailing
	// punctuation exactly as the whitespace-terminated case does, so the
	// result does not depend on whether the input ends with whitespace. The
	// first rune of the word is never punctuation, hence end > start.
	if atEOF && curr > start {
		end := curr - punctWidth
		return end, data[start:end], nil
	}

	// Request more data.
	return start, nil, nil
}

// isPunctuation reports whether r is treated as punctuation by SplitWords:
// any rune that is not whitespace, a letter, a number, or a combining mark.
// Symbols and the utf8.RuneError replacement character therefore count as
// punctuation. Combining marks are excluded so that a word ending in a
// combining accent or vowel sign keeps that mark.
func isPunctuation(r rune) bool {
	return !(unicode.IsSpace(r) || unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsMark(r))
}
// isOpeningPunct reports whether r belongs to Unicode category Ps (open
// punctuation, e.g. '(') or Pi (initial quote punctuation, e.g. U+00AB).
func isOpeningPunct(r rune) bool {
	if unicode.Is(unicode.Ps, r) {
		return true
	}
	return unicode.Is(unicode.Pi, r)
}
// isClosingPunct reports whether r belongs to Unicode category Pe (close
// punctuation, e.g. ')'), Po (other punctuation, e.g. '.' or '!') or Pf
// (final quote punctuation, e.g. U+00BB).
func isClosingPunct(r rune) bool {
	switch {
	case unicode.Is(unicode.Pe, r),
		unicode.Is(unicode.Po, r),
		unicode.Is(unicode.Pf, r):
		return true
	default:
		return false
	}
}
// StringHasher turns s into a single string key by joining its elements with
// one space between neighbours. An empty or nil slice yields "". Elements
// that themselves contain spaces are not escaped, so different slices can
// produce the same key.
func StringHasher(s []string) string {
	const sep = " "
	key := strings.Join(s, sep)
	return key
}
// StringHasherConst3 converts a slice of exactly three strings into a
// [3]string holding the same elements in the same order, which can serve as
// a comparable key. It panics with "unknown len" when len(s) is not 3.
func StringHasherConst3(s []string) [3]string {
	if len(s) != 3 {
		panic("unknown len")
	}
	var key [3]string
	copy(key[:], s)
	return key
}
// Int64FromBytes derives a deterministic int64 from data: it computes the
// SHA-256 digest of data and interprets the first eight digest bytes as a
// big-endian unsigned integer, reinterpreted as int64 (so the result may be
// negative).
func Int64FromBytes(data []byte) int64 {
	digest := sha256.Sum256(data)
	prefix := digest[:8]
	return int64(binary.BigEndian.Uint64(prefix))
}
// CountingWriter wraps an io.Writer and keeps a running total of the byte
// counts reported by the wrapped writer.
type CountingWriter struct {
	w io.Writer // destination every Write call is forwarded to
	C int       // sum of the byte counts returned by w.Write so far
}

// Compile-time check that *CountingWriter implements io.Writer.
var _ io.Writer = (*CountingWriter)(nil)

// NewCountingWriter returns a CountingWriter that forwards writes to w and
// whose counter C starts at zero.
func NewCountingWriter(w io.Writer) *CountingWriter {
	return &CountingWriter{w: w}
}

// Write forwards p to the wrapped writer and adds the number of bytes that
// writer reports as written to C, including on a short or failed write. The
// wrapped writer's results are returned unchanged. C is updated without any
// synchronization.
func (cw *CountingWriter) Write(p []byte) (int, error) {
	written, err := cw.w.Write(p)
	cw.C += written
	return written, err
}