Skip to content

Commit 8c63de8

Browse files
committed
Implement #scan_integer to efficiently parse Integer
Fix: #113 This allows parsing an Integer directly from a String without first allocating a substring. Notes: The implementation is limited by design; it is meant as a first step, and only the most straightforward base-10 integers are supported.
1 parent 81a80a1 commit 8c63de8

3 files changed

Lines changed: 141 additions & 0 deletions

File tree

ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
import org.jruby.runtime.builtin.IRubyObject;
5555
import org.jruby.util.ByteList;
5656
import org.jruby.util.StringSupport;
57+
import org.jruby.util.ConvertBytes;
5758

5859
import java.util.Iterator;
5960

@@ -556,6 +557,47 @@ public IRubyObject peep(ThreadContext context, IRubyObject length) {
556557
return peek(context, length);
557558
}
558559

560+
@JRubyMethod(name = "scan_integer")
561+
public IRubyObject scan_integer(ThreadContext context) {
562+
final Ruby runtime = context.runtime;
563+
check(context);
564+
clearMatched();
565+
566+
if (!str.getEncoding().isAsciiCompatible()) {
567+
throw getRuntime().newEncodingCompatibilityError("ASCII incompatible encoding: " + str.getEncoding());
568+
}
569+
570+
571+
ByteList bytes = str.getByteList();
572+
int curr = this.curr;
573+
574+
int bite = bytes.get(curr);
575+
if (bite == '-' || bite == '+') {
576+
curr++;
577+
bite = bytes.get(curr);
578+
}
579+
580+
if (!(bite >= '0' && bite <= '9')) {
581+
return runtime.getNil();
582+
}
583+
584+
while (bite >= '0' && bite <= '9') {
585+
curr++;
586+
if (curr >= bytes.getRealSize()) {
587+
break;
588+
}
589+
bite = bytes.get(curr);
590+
}
591+
592+
int length = curr - this.curr;
593+
prev = this.curr;
594+
this.curr = curr;
595+
setMatched();
596+
adjustRegisters();
597+
598+
return ConvertBytes.byteListToInum(runtime, bytes, prev, curr, 10, true);
599+
}
600+
559601
@JRubyMethod(name = "unscan")
560602
public IRubyObject unscan(ThreadContext context) {
561603
check(context);

ext/strscan/strscan.c

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ extern size_t onig_region_memsize(const struct re_registers *regs);
2121
#endif
2222

2323
#include <stdbool.h>
24+
#include <ctype.h>
2425

2526
#define STRSCAN_VERSION "3.1.1"
2627

@@ -115,6 +116,7 @@ static VALUE strscan_get_byte _((VALUE self));
115116
static VALUE strscan_getbyte _((VALUE self));
116117
static VALUE strscan_peek _((VALUE self, VALUE len));
117118
static VALUE strscan_peep _((VALUE self, VALUE len));
119+
static VALUE strscan_scan_integer _((VALUE self));
118120
static VALUE strscan_unscan _((VALUE self));
119121
static VALUE strscan_bol_p _((VALUE self));
120122
static VALUE strscan_eos_p _((VALUE self));
@@ -1266,6 +1268,52 @@ strscan_peep(VALUE self, VALUE vlen)
12661268
return strscan_peek(self, vlen);
12671269
}
12681270

1271+
/*
1272+
* call-seq:
1273+
* scan_integer
1274+
*
1275+
* Equivalent to #scan with a \-?\d+ pattern, and returns an Integer or nil.
1276+
*/
1277+
static VALUE
1278+
strscan_scan_integer(VALUE self)
1279+
{
1280+
char *ptr, *buffer;
1281+
long len = 0;
1282+
VALUE buffer_v, integer;
1283+
struct strscanner *p;
1284+
1285+
GET_SCANNER(self, p);
1286+
CLEAR_MATCH_STATUS(p);
1287+
1288+
rb_must_asciicompat(p->str);
1289+
1290+
ptr = CURPTR(p);
1291+
1292+
if (ptr[len] == '-' || ptr[len] == '+') {
1293+
len++;
1294+
}
1295+
1296+
if (!isdigit(ptr[len])) {
1297+
return Qnil;
1298+
}
1299+
1300+
MATCHED(p);
1301+
p->prev = p->curr;
1302+
1303+
while(isdigit(ptr[len])) {
1304+
len++;
1305+
}
1306+
1307+
buffer = ALLOCV_N(char, buffer_v, len + 1);
1308+
1309+
MEMCPY(buffer, CURPTR(p), char, len);
1310+
buffer[len] = '\0';
1311+
integer = rb_cstr2inum(buffer, 10);
1312+
RB_GC_GUARD(buffer_v);
1313+
p->curr += len;
1314+
return integer;
1315+
}
1316+
12691317
/*
12701318
* :markup: markdown
12711319
* :include: strscan/link_refs.txt
@@ -2204,6 +2252,8 @@ Init_strscan(void)
22042252
rb_define_method(StringScanner, "peek_byte", strscan_peek_byte, 0);
22052253
rb_define_method(StringScanner, "peep", strscan_peep, 1);
22062254

2255+
rb_define_method(StringScanner, "scan_integer", strscan_scan_integer, 0);
2256+
22072257
rb_define_method(StringScanner, "unscan", strscan_unscan, 0);
22082258

22092259
rb_define_method(StringScanner, "beginning_of_line?", strscan_bol_p, 0);

test/strscan/test_stringscanner.rb

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -890,6 +890,55 @@ def test_named_captures
890890
assert_equal(9, scan.match?(/(?<f>foo)(?<r>bar)(?<z>baz)/))
891891
assert_equal({"f" => "foo", "r" => "bar", "z" => "baz"}, scan.named_captures)
892892
end
893+
894+
# Exercises #scan_integer on non-numeric, unsigned, signed and very large
# inputs, checking the returned value, the resulting position and the
# matched? flag in each case.
def test_scan_integer
  # No digits at the current position: nil, and nothing is consumed.
  s = create_string_scanner('abc')
  assert_nil(s.scan_integer)
  assert_equal(0, s.pos)
  refute_predicate(s, :matched?)

  s = create_string_scanner('123abc')
  assert_equal(123, s.scan_integer)
  assert_equal(3, s.pos)
  assert_predicate(s, :matched?)

  # Parentheses avoid Ruby's "ambiguous first argument" warning on -123.
  s = create_string_scanner('-123abc')
  assert_equal(-123, s.scan_integer)
  assert_equal(4, s.pos)
  assert_predicate(s, :matched?)

  s = create_string_scanner('+123')
  assert_equal(123, s.scan_integer)
  assert_equal(4, s.pos)
  assert_predicate(s, :matched?)

  # A sign that is not followed by a digit must not be consumed.
  s = create_string_scanner('-abc')
  assert_nil(s.scan_integer)
  assert_equal(0, s.pos)
  refute_predicate(s, :matched?)

  # 2000 digits: far larger than any machine integer.
  huge_integer = '1' * 2_000
  s = create_string_scanner(huge_integer)
  assert_equal(huge_integer.to_i, s.scan_integer)
  assert_equal(2_000, s.pos)
  assert_predicate(s, :matched?)
end
926+
927+
# After a successful #scan_integer, #unscan must restore the position the
# scanner had before the integer was consumed.
def test_scan_integer_unmatch
  scanner = create_string_scanner('123abc')
  assert_equal(123, scanner.scan_integer)
  assert_equal(3, scanner.pos)

  scanner.unscan
  assert_equal(0, scanner.pos)
end
935+
936+
# #scan_integer must raise on a string whose encoding is not ASCII
# compatible (UTF-32LE here).
def test_scan_integer_encoding
  utf32_source = '123abc'.encode(Encoding::UTF_32LE)
  scanner = create_string_scanner(utf32_source)
  assert_raise(Encoding::CompatibilityError) { scanner.scan_integer }
end
893942
end
894943

895944
class TestStringScanner < Test::Unit::TestCase

0 commit comments

Comments
 (0)