From f89d745a1195d88a19d156511a0c3f36f4d59c2c Mon Sep 17 00:00:00 2001 From: Herwin Date: Thu, 15 Jan 2026 14:43:33 +0100 Subject: [PATCH 1/6] Add specs for rb_interned_str and rb_interned_str_cstr --- optional/capi/ext/string_spec.c | 10 ++++ optional/capi/string_spec.rb | 102 ++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/optional/capi/ext/string_spec.c b/optional/capi/ext/string_spec.c index 094013e049..74aa9e56e8 100644 --- a/optional/capi/ext/string_spec.c +++ b/optional/capi/ext/string_spec.c @@ -581,6 +581,14 @@ static VALUE string_spec_rb_str_to_interned_str(VALUE self, VALUE str) { return rb_str_to_interned_str(str); } +static VALUE string_spec_rb_interned_str(VALUE self, VALUE str, VALUE len) { + return rb_interned_str(RSTRING_PTR(str), FIX2LONG(len)); +} + +static VALUE string_spec_rb_interned_str_cstr(VALUE self, VALUE str) { + return rb_interned_str_cstr(RSTRING_PTR(str)); +} + void Init_string_spec(void) { VALUE cls = rb_define_class("CApiStringSpecs", rb_cObject); rb_define_method(cls, "rb_cstr2inum", string_spec_rb_cstr2inum, 2); @@ -681,6 +689,8 @@ void Init_string_spec(void) { rb_define_method(cls, "rb_enc_interned_str_cstr", string_spec_rb_enc_interned_str_cstr, 2); rb_define_method(cls, "rb_enc_interned_str", string_spec_rb_enc_interned_str, 3); rb_define_method(cls, "rb_str_to_interned_str", string_spec_rb_str_to_interned_str, 1); + rb_define_method(cls, "rb_interned_str", string_spec_rb_interned_str, 2); + rb_define_method(cls, "rb_interned_str_cstr", string_spec_rb_interned_str_cstr, 1); } #ifdef __cplusplus diff --git a/optional/capi/string_spec.rb b/optional/capi/string_spec.rb index 72f20ee6a5..4060cfabf4 100644 --- a/optional/capi/string_spec.rb +++ b/optional/capi/string_spec.rb @@ -1373,4 +1373,106 @@ def inspect @s.rb_str_to_interned_str("hello").should.equal?(-"hello") end end + + describe "rb_interned_str" do + it "returns a frozen string" do + str = "hello" + result = @s.rb_interned_str(str, 
str.bytesize) + result.should.is_a?(String) + result.should.frozen? + result.encoding.should == Encoding::US_ASCII + end + + it "returns the same frozen string" do + str = "hello" + result1 = @s.rb_interned_str(str, str.bytesize) + result2 = @s.rb_interned_str(str, str.bytesize) + result1.should.equal?(result2) + end + + it "supports strings with embedded null bytes" do + str = "foo\x00bar\x00baz".b + result = @s.rb_interned_str(str, str.bytesize) + result.should == str + end + + it "support binary strings that are invalid in ASCII encoding" do + str = "foo\x81bar\x82baz".b + result = @s.rb_interned_str(str, str.bytesize) + result.encoding.should == Encoding::US_ASCII + result.should == str.dup.force_encoding(Encoding::US_ASCII) + result.should_not.valid_encoding? + end + + it "returns the same frozen strings for different encodings" do + str1 = "hello".dup.force_encoding(Encoding::US_ASCII) + str2 = "hello".dup.force_encoding(Encoding::UTF_8) + result1 = @s.rb_interned_str(str1, str1.bytesize) + result2 = @s.rb_interned_str(str2, str2.bytesize) + result1.should.equal?(result2) + end + + it 'returns the same string when using non-ascii characters' do + str = 'こんにちは' + result1 = @s.rb_interned_str(str, str.bytesize) + result2 = @s.rb_interned_str(str, str.bytesize) + result1.should.equal?(result2) + end + + it "returns the same string as String#-@" do + str = "hello".dup.force_encoding(Encoding::US_ASCII) + @s.rb_interned_str(str, str.bytesize).should.equal?(-str) + end + end + + describe "rb_interned_str_cstr" do + it "returns a frozen string" do + str = "hello" + result = @s.rb_interned_str_cstr(str) + result.should.is_a?(String) + result.should.frozen? 
+ result.encoding.should == Encoding::US_ASCII + end + + it "returns the same frozen string" do + str = "hello" + result1 = @s.rb_interned_str_cstr(str) + result2 = @s.rb_interned_str_cstr(str) + result1.should.equal?(result2) + end + + it "does not support strings with embedded null bytes" do + str = "foo\x00bar\x00baz".b + result = @s.rb_interned_str_cstr(str) + result.should == "foo" + end + + it "support binary strings that are invalid in ASCII encoding" do + str = "foo\x81bar\x82baz".b + result = @s.rb_interned_str_cstr(str) + result.encoding.should == Encoding::US_ASCII + result.should == str.dup.force_encoding(Encoding::US_ASCII) + result.should_not.valid_encoding? + end + + it "returns the same frozen strings for different encodings" do + str1 = "hello".dup.force_encoding(Encoding::US_ASCII) + str2 = "hello".dup.force_encoding(Encoding::UTF_8) + result1 = @s.rb_interned_str_cstr(str1) + result2 = @s.rb_interned_str_cstr(str2) + result1.should.equal?(result2) + end + + it 'returns the same string when using non-ascii characters' do + str = 'こんにちは' + result1 = @s.rb_interned_str_cstr(str) + result2 = @s.rb_interned_str_cstr(str) + result1.should.equal?(result2) + end + + it "returns the same string as String#-@" do + str = "hello".dup.force_encoding(Encoding::US_ASCII) + @s.rb_interned_str_cstr(str).should.equal?(-str) + end + end end From 5f23192b4e6cf1f68dfc6f41cc66b22011def6bf Mon Sep 17 00:00:00 2001 From: Herwin Date: Sat, 17 Jan 2026 06:47:50 +0100 Subject: [PATCH 2/6] Update specs for rb_interned_str Updated them to the current status of the upstream bug report, which is that all strings are binary. This might change to either ASCII-7BIT or BINARY. 
The current specs pass with the latest upstream version of Ruby 4.1 (commit 3e13b7d4ef) --- optional/capi/string_spec.rb | 52 ++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/optional/capi/string_spec.rb b/optional/capi/string_spec.rb index 4060cfabf4..86535189b5 100644 --- a/optional/capi/string_spec.rb +++ b/optional/capi/string_spec.rb @@ -1380,7 +1380,9 @@ def inspect result = @s.rb_interned_str(str, str.bytesize) result.should.is_a?(String) result.should.frozen? - result.encoding.should == Encoding::US_ASCII + ruby_bug "21842", ""..."4.1" do + result.encoding.should == Encoding::BINARY + end end it "returns the same frozen string" do @@ -1396,12 +1398,14 @@ def inspect result.should == str end - it "support binary strings that are invalid in ASCII encoding" do - str = "foo\x81bar\x82baz".b - result = @s.rb_interned_str(str, str.bytesize) - result.encoding.should == Encoding::US_ASCII - result.should == str.dup.force_encoding(Encoding::US_ASCII) - result.should_not.valid_encoding? + ruby_bug "21842", ""..."4.1" do + it "support binary strings that are invalid in ASCII encoding" do + str = "foo\x81bar\x82baz".b + result = @s.rb_interned_str(str, str.bytesize) + result.encoding.should == Encoding::BINARY + result.should == str + result.should.valid_encoding? + end end it "returns the same frozen strings for different encodings" do @@ -1419,9 +1423,11 @@ def inspect result1.should.equal?(result2) end - it "returns the same string as String#-@" do - str = "hello".dup.force_encoding(Encoding::US_ASCII) - @s.rb_interned_str(str, str.bytesize).should.equal?(-str) + ruby_bug "21842", ""..."4.1" do + it "returns the same string as String#-@" do + str = "hello".b + @s.rb_interned_str(str, str.bytesize).should.equal?(-str) + end end end @@ -1431,7 +1437,9 @@ def inspect result = @s.rb_interned_str_cstr(str) result.should.is_a?(String) result.should.frozen? 
- result.encoding.should == Encoding::US_ASCII + ruby_bug "21842", ""..."4.1" do + result.encoding.should == Encoding::BINARY + end end it "returns the same frozen string" do @@ -1447,12 +1455,14 @@ def inspect result.should == "foo" end - it "support binary strings that are invalid in ASCII encoding" do - str = "foo\x81bar\x82baz".b - result = @s.rb_interned_str_cstr(str) - result.encoding.should == Encoding::US_ASCII - result.should == str.dup.force_encoding(Encoding::US_ASCII) - result.should_not.valid_encoding? + ruby_bug "21842", ""..."4.1" do + it "support binary strings that are invalid in ASCII encoding" do + str = "foo\x81bar\x82baz".b + result = @s.rb_interned_str_cstr(str) + result.encoding.should == Encoding::BINARY + result.should == str + result.should.valid_encoding? + end end it "returns the same frozen strings for different encodings" do @@ -1470,9 +1480,11 @@ def inspect result1.should.equal?(result2) end - it "returns the same string as String#-@" do - str = "hello".dup.force_encoding(Encoding::US_ASCII) - @s.rb_interned_str_cstr(str).should.equal?(-str) + ruby_bug "21842", ""..."4.1" do + it "returns the same string as String#-@" do + str = "hello".b + @s.rb_interned_str_cstr(str).should.equal?(-str) + end end end end From f7376683e0abc4da4390ba5026d89784e5c5e8df Mon Sep 17 00:00:00 2001 From: Herwin Date: Sat, 17 Jan 2026 06:57:58 +0100 Subject: [PATCH 3/6] Update specs to new behaviour If the whole string is valid ASCII, it returns US-ASCII encoding. Otherwise, it is BINARY. 
The current specs pass with the latest upstream version of Ruby 4.1 (commit 78b7646bdb) --- optional/capi/string_spec.rb | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/optional/capi/string_spec.rb b/optional/capi/string_spec.rb index 86535189b5..d32b7eba85 100644 --- a/optional/capi/string_spec.rb +++ b/optional/capi/string_spec.rb @@ -1380,9 +1380,7 @@ def inspect result = @s.rb_interned_str(str, str.bytesize) result.should.is_a?(String) result.should.frozen? - ruby_bug "21842", ""..."4.1" do - result.encoding.should == Encoding::BINARY - end + result.encoding.should == Encoding::US_ASCII end it "returns the same frozen string" do @@ -1399,12 +1397,11 @@ def inspect end ruby_bug "21842", ""..."4.1" do - it "support binary strings that are invalid in ASCII encoding" do + it "uses BINARY encoding for strings that are not valid US-ASCII" do str = "foo\x81bar\x82baz".b result = @s.rb_interned_str(str, str.bytesize) result.encoding.should == Encoding::BINARY result.should == str - result.should.valid_encoding? end end @@ -1425,7 +1422,7 @@ def inspect ruby_bug "21842", ""..."4.1" do it "returns the same string as String#-@" do - str = "hello".b + str = "hello".dup.force_encoding(Encoding::US_ASCII) @s.rb_interned_str(str, str.bytesize).should.equal?(-str) end end @@ -1437,9 +1434,7 @@ def inspect result = @s.rb_interned_str_cstr(str) result.should.is_a?(String) result.should.frozen? 
- ruby_bug "21842", ""..."4.1" do - result.encoding.should == Encoding::BINARY - end + result.encoding.should == Encoding::US_ASCII end it "returns the same frozen string" do @@ -1456,7 +1451,7 @@ def inspect end ruby_bug "21842", ""..."4.1" do - it "support binary strings that are invalid in ASCII encoding" do + it "uses BINARY encoding for strings that are not valid US-ASCII" do str = "foo\x81bar\x82baz".b result = @s.rb_interned_str_cstr(str) result.encoding.should == Encoding::BINARY @@ -1482,7 +1477,7 @@ def inspect ruby_bug "21842", ""..."4.1" do it "returns the same string as String#-@" do - str = "hello".b + str = "hello".dup.force_encoding(Encoding::US_ASCII) @s.rb_interned_str_cstr(str).should.equal?(-str) end end From 7266f1b9673242ca1091f98264fbdeb1a000b0c2 Mon Sep 17 00:00:00 2001 From: Herwin Date: Sun, 18 Jan 2026 09:29:27 +0100 Subject: [PATCH 4/6] Update specs for rb_interned_str(_cstr) Add full checks for the behaviour around US_ASCII/BINARY encoding choice. Remove some of the checks that depend on the encoding of the source string, since this function uses C char* without any encoding information. 
--- optional/capi/string_spec.rb | 59 +++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/optional/capi/string_spec.rb b/optional/capi/string_spec.rb index d32b7eba85..ac85eff2f3 100644 --- a/optional/capi/string_spec.rb +++ b/optional/capi/string_spec.rb @@ -1396,21 +1396,26 @@ def inspect result.should == str end - ruby_bug "21842", ""..."4.1" do - it "uses BINARY encoding for strings that are not valid US-ASCII" do - str = "foo\x81bar\x82baz".b - result = @s.rb_interned_str(str, str.bytesize) - result.encoding.should == Encoding::BINARY - result.should == str + it "return US_ASCII encoding for an empty string" do + result = @s.rb_interned_str("", 0) + result.should == "" + result.encoding.should == Encoding::US_ASCII + end + + it "returns US_ASCII encoding for strings of only 7 bit ASCII" do + 0x00.upto(0x7f).each do |char| + result = @s.rb_interned_str(char.chr, 1) + result.encoding.should == Encoding::US_ASCII end end - it "returns the same frozen strings for different encodings" do - str1 = "hello".dup.force_encoding(Encoding::US_ASCII) - str2 = "hello".dup.force_encoding(Encoding::UTF_8) - result1 = @s.rb_interned_str(str1, str1.bytesize) - result2 = @s.rb_interned_str(str2, str2.bytesize) - result1.should.equal?(result2) + ruby_bug "21842", ""..."4.1" do + it "returns BINARY encoding for strings that use the 8th bit" do + 0x80.upto(0xff) do |char| + result = @s.rb_interned_str(char.chr, 1) + result.encoding.should == Encoding::BINARY + end + end end it 'returns the same string when using non-ascii characters' do @@ -1450,22 +1455,26 @@ def inspect result.should == "foo" end - ruby_bug "21842", ""..."4.1" do - it "uses BINARY encoding for strings that are not valid US-ASCII" do - str = "foo\x81bar\x82baz".b - result = @s.rb_interned_str_cstr(str) - result.encoding.should == Encoding::BINARY - result.should == str - result.should.valid_encoding? 
+ it "return US_ASCII encoding for an empty string" do + result = @s.rb_interned_str_cstr("") + result.should == "" + result.encoding.should == Encoding::US_ASCII + end + + it "returns US_ASCII encoding for strings of only 7 bit ASCII" do + 0x01.upto(0x7f).each do |char| + result = @s.rb_interned_str_cstr(char.chr) + result.encoding.should == Encoding::US_ASCII end end - it "returns the same frozen strings for different encodings" do - str1 = "hello".dup.force_encoding(Encoding::US_ASCII) - str2 = "hello".dup.force_encoding(Encoding::UTF_8) - result1 = @s.rb_interned_str_cstr(str1) - result2 = @s.rb_interned_str_cstr(str2) - result1.should.equal?(result2) + ruby_bug "21842", ""..."4.1" do + it "returns BINARY encoding for strings that use the 8th bit" do + 0x80.upto(0xff) do |char| + result = @s.rb_interned_str_cstr(char.chr) + result.encoding.should == Encoding::BINARY + end + end end it 'returns the same string when using non-ascii characters' do From e1c587dbfbabdf55de57b938085d2a9e3f7ea538 Mon Sep 17 00:00:00 2001 From: Herwin Date: Sun, 18 Jan 2026 09:39:15 +0100 Subject: [PATCH 5/6] Add additional test for encoding of rb_str_to_interned_str The documentation in the MRI source refers to rb_interned_str as being similar; this test is just to accentuate the difference between these two functions. 
--- optional/capi/string_spec.rb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/optional/capi/string_spec.rb b/optional/capi/string_spec.rb index ac85eff2f3..048cde5907 100644 --- a/optional/capi/string_spec.rb +++ b/optional/capi/string_spec.rb @@ -1369,6 +1369,13 @@ def inspect result1.should_not.equal?(result2) end + it "preserves the encoding of the original string" do + result1 = @s.rb_str_to_interned_str("hello".dup.force_encoding(Encoding::US_ASCII)) + result2 = @s.rb_str_to_interned_str("hello".dup.force_encoding(Encoding::UTF_8)) + result1.encoding.should == Encoding::US_ASCII + result2.encoding.should == Encoding::UTF_8 + end + it "returns the same string as String#-@" do @s.rb_str_to_interned_str("hello").should.equal?(-"hello") end From 3553d173777495d0542aab91debfe441648975e5 Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Sun, 18 Jan 2026 14:27:28 +0100 Subject: [PATCH 6/6] Fix indentation --- optional/capi/string_spec.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/optional/capi/string_spec.rb b/optional/capi/string_spec.rb index 048cde5907..889f0a6cfe 100644 --- a/optional/capi/string_spec.rb +++ b/optional/capi/string_spec.rb @@ -1419,8 +1419,8 @@ def inspect ruby_bug "21842", ""..."4.1" do it "returns BINARY encoding for strings that use the 8th bit" do 0x80.upto(0xff) do |char| - result = @s.rb_interned_str(char.chr, 1) - result.encoding.should == Encoding::BINARY + result = @s.rb_interned_str(char.chr, 1) + result.encoding.should == Encoding::BINARY end end end @@ -1478,8 +1478,8 @@ def inspect ruby_bug "21842", ""..."4.1" do it "returns BINARY encoding for strings that use the 8th bit" do 0x80.upto(0xff) do |char| - result = @s.rb_interned_str_cstr(char.chr) - result.encoding.should == Encoding::BINARY + result = @s.rb_interned_str_cstr(char.chr) + result.encoding.should == Encoding::BINARY end end end