I believe I am hitting a bug with moving data from a UTF8 to a non-UTF8 collations in SQL Server 2022 CU13 (running under Linux but assuming for now this issue is not related).
The issue is that MSSQL appears to be determining the width of a string for insertion into a column using its source (UTF8) size not it's destination size. Yes understand string sizes are now in bytes not characters and that's the issue -- the size of the string appears to be calculated based on the encoding in the source collation (UTF8) and then generating an error because the destination column can't hold that many bytes. However, once converted to the destination column's collation (i.e. associated encoding of character set) the string requires less bytes and would fit. If not for the false error blocking it...
Note that starting with the same data in nvarchar and inserting it into the same destination column works fine. Also it can be verified that the data fits in the destination column by inserting it into a wider column with the same collation as the original destination and then measuring its size after insertion.
Below is SQL to reproduce. You'll see its mostly variations on the same thing up until the last steps where it generates the error when trying to insert a single GBP symbol (U+00A3) into a varchar(1) COLLATE [Windows-1252] column. While this symbol requires 2 bytes in UTF-8, it only requires 1 byte in a Windows-1252 codepage so should fit in the 1 character destination column except it errors if the source of that character is from an UTF-8 encoded column..
SET NOCOUNT ON;
DROP TABLE IF EXISTS A1252;
DROP TABLE IF EXISTS AUCS2;
DROP TABLE IF EXISTS AUTF8;
DROP TABLE IF EXISTS A1252b;
GO
CREATE TABLE A1252 (s varchar(1) COLLATE SQL_Latin1_General_CP1_CI_AS);
CREATE TABLE AUCS2 (s nvarchar(1) COLLATE Latin1_General_100_BIN2_UTF8);
CREATE TABLE AUTF8 (s varchar(2) COLLATE Latin1_General_100_BIN2_UTF8);
CREATE TABLE A1252b (s varchar(2) COLLATE SQL_Latin1_General_CP1_CI_AS);
INSERT INTO A1252
SELECT NCHAR(0x0024) as s; --USD symbol
INSERT INTO AUCS2
SELECT NCHAR(0x0024) as s; --USD symbol
INSERT INTO AUTF8
SELECT NCHAR(0x0024) as s; --USD symbol
--This will show a USD symbol is 1 character, 1 byte long in Windows-1252
SELECT s AS s_asWin1252
,LEN(s) LenOf_s_asWin1252
,DATALENGTH(s) as OctetLenOf_s_asWin1252
FROM A1252;
--This will show a USD symbol is 1 character, 2 bytes long in UCS-2
SELECT s AS s_asUCS2
,LEN(s) LenOf_s_asUCS2
,DATALENGTH(s) as OctetLenOf_s_asUCS2
FROM AUCS2;
--This will show a USD symbol is 1 character, 1 byte long in UTF-8
SELECT s AS s_asUTF8
,LEN(s) LenOf_s_asUTF8
,DATALENGTH(s) as OctetLenOf_s_asUTF8
FROM AUTF8;
--All good up to here, now let's try moving the data around
TRUNCATE TABLE A1252;
INSERT INTO A1252
SELECT s
FROM AUCS2;
TRUNCATE TABLE A1252;
INSERT INTO A1252
SELECT s
FROM AUTF8;
TRUNCATE TABLE AUCS2;
INSERT INTO AUCS2
SELECT s
FROM AUTF8;
TRUNCATE TABLE AUTF8;
INSERT INTO AUTF8
SELECT s
FROM AUCS2;
TRUNCATE TABLE A1252;
INSERT INTO A1252
SELECT s
FROM AUTF8;
--All good to here and confirm character and byte lengths as expected:
SELECT s AS s_asWin1252
,LEN(s) LenOf_s_asWin1252
,DATALENGTH(s) as OctetLenOf_s_asWin1252
FROM A1252;
SELECT s AS s_asUCS2
,LEN(s) LenOf_s_asUCS2
,DATALENGTH(s) as OctetLenOf_s_asUCS2
FROM AUCS2;
SELECT s AS s_asUTF8
,LEN(s) LenOf_s_asUTF8
,DATALENGTH(s) as OctetLenOf_s_asUTF8
FROM AUTF8;
--All good up to here -- now let's try the GBP symbol
INSERT INTO A1252
SELECT NCHAR(0x00A3) as s; --GBP symbol
INSERT INTO AUCS2
SELECT NCHAR(0x00A3) as s; --GBP symbol
INSERT INTO AUTF8
SELECT NCHAR(0x00A3) as s; --GBP symbol
--This will show a row for each USD and GBP symbols, each 1 character, 1 byte long in Windows-1252
SELECT s AS s_asWin1252
,LEN(s) LenOf_s_asWin1252
,DATALENGTH(s) as OctetLenOf_s_asWin1252
FROM A1252;
--This will show a row for each USD and GBP symbols, each 1 character, 2 bytes long in UCS-2
SELECT s AS s_asUCS2
,LEN(s) LenOf_s_asUCS2
,DATALENGTH(s) as OctetLenOf_s_asUCS2
FROM AUCS2;
--This will show a row USD symbol, 1 character and 1 byte long in UTF-8 and a GBP symbol, 1 character and 2 bytes long in UTF-8
SELECT s AS s_asUTF8
,LEN(s) LenOf_s_asUTF8
,DATALENGTH(s) as OctetLenOf_s_asUTF8
FROM AUTF8;
--All good up to here, now let's try moving the data around again
TRUNCATE TABLE A1252;
INSERT INTO A1252
SELECT s
FROM AUCS2;
TRUNCATE TABLE AUTF8;
INSERT INTO AUTF8
SELECT s
FROM AUCS2;
TRUNCATE TABLE AUCS2;
INSERT INTO AUCS2
SELECT s
FROM AUTF8;
--All good to here and confirm character and byte lengths as expected:
SELECT s AS s_asWin1252
,LEN(s) LenOf_s_asWin1252
,DATALENGTH(s) as OctetLenOf_s_asWin1252
FROM A1252;
SELECT s AS s_asUCS2
,LEN(s) LenOf_s_asUCS2
,DATALENGTH(s) as OctetLenOf_s_asUCS2
FROM AUCS2
SELECT s AS s_asUTF8
,LEN(s) LenOf_s_asUTF8
,DATALENGTH(s) as OctetLenOf_s_asUTF8
FROM AUTF8
--Though wasn't needed, this will of course work With the wider maximum column width:
INSERT INTO A1252b
SELECT s
FROM AUTF8
--Confirming extra column width not actually needed:
SELECT s AS s_asWin1252
,LEN(s) LenOf_s_asWin1252
,DATALENGTH(s) as OctetLenOf_s_Win1252
FROM A1252b
--Emptying A1252 to avoid ambiguity
TRUNCATE TABLE A1252;
--Confirming one last time that data in AUTF8 only requires 1 character/byte as Win1252
SELECT s AS s_asUTF8,s COLLATE SQL_Latin1_General_CP1_CI_AS as s_asWin1252
,LEN(s) LenOf_s_asUTF8,LEN(s COLLATE SQL_Latin1_General_CP1_CI_AS) as LenOf_s_asWin1252
,DATALENGTH(s) as OctetLenOf_s_asUTF8,DATALENGTH(s COLLATE SQL_Latin1_General_CP1_CI_AS) as OctetLenOf_s_asWin1252
FROM AUTF8;
PRINT 'The next SQL statement will fail claiming truncation but it should have worked as well as when the data came from AUCS2'
SELECT TOP(0) NULL AS 'The next SQL statement will fail claiming truncation but it should have worked as well as when the data came from AUCS2'
--This is failing but it should work the same as it did when coming from AUCS2
INSERT INTO A1252
SELECT s
FROM AUTF8;
SELECT TOP(0) NULL AS 'Contents of AUTF8 that are supposed to be in A1252'
SELECT s AS s_asWin1252
,LEN(s) LenOf_s_asWin1252
,DATALENGTH(s) as OctetLenOf_s_Win1252
FROM AUTF8;
SELECT TOP(0) NULL AS 'Contents of A1252 that should have been the same as AUTF8';
SELECT s AS s_asWin1252
,LEN(s) LenOf_s_asWin1252
,DATALENGTH(s) as OctetLenOf_s_Win1252
FROM A1252;