Sample Header Ad - 728x90

Optimize extraction of json data via OPENJSON

7 votes
2 answers
4506 views
I'm attempting to optimize extraction of values obtained from a REST API which returns json values in an array. Here's an minimal, complete, and verifiable example that reflects exactly what I'm doing.
USE tempdb;

DROP TABLE IF EXISTS dbo.json_test;

CREATE TABLE dbo.json_test
(
    json_test_id                int                 NOT NULL
        IDENTITY(1,1)
    , some_uniqueidentifier     uniqueidentifier    NULL
    , some_varchar              varchar(100)        NULL
    , the_json                  nvarchar(max)       NULL
);

INSERT INTO dbo.json_test (some_uniqueidentifier, some_varchar, the_json)
SELECT 
        some_uniqueidentifier       = NEWID()
      , some_varchar                = CONVERT(varchar(100), CRYPT_GEN_RANDOM(64), 1)
      , the_json = (
            SELECT st.* 
            FROM sys.tables st
                CROSS JOIN sys.tables st2
            WHERE st.object_id = t.object_id FOR JSON AUTO
            )
FROM sys.tables t;

;WITH src AS 
(
    SELECT jt.some_uniqueidentifier
        , jt.some_varchar
        , top_array.[key]
        , top_array.[value]
    FROM dbo.json_test jt
        CROSS APPLY OPENJSON(jt.the_json, N'lax $') top_array
),
src2 AS
(
    SELECT src.some_uniqueidentifier
        , src.some_varchar
        , src.[key]
        , src.[value]
        , inner_key = inner_array.[key]
        , inner_value = inner_array.[value]
    FROM src
        CROSS APPLY OPENJSON(src.[value], N'lax $') inner_array
)
SELECT src2.some_uniqueidentifier
    , src2.some_varchar
    , src2.[key]
    , [name]                                = MAX(CASE WHEN src2.[inner_key] = 'name'                               THEN src2.[inner_value] ELSE NULL END)
    , [object_id]                           = MAX(CASE WHEN src2.[inner_key] = 'object_id'                          THEN src2.[inner_value] ELSE NULL END)
    , [principal_id]                        = MAX(CASE WHEN src2.[inner_key] = 'principal_id'                       THEN src2.[inner_value] ELSE NULL END)
    , [schema_id]                           = MAX(CASE WHEN src2.[inner_key] = 'schema_id'                          THEN src2.[inner_value] ELSE NULL END)
    , [parent_object_id]                    = MAX(CASE WHEN src2.[inner_key] = 'parent_object_id'                   THEN src2.[inner_value] ELSE NULL END)
    , [type]                                = MAX(CASE WHEN src2.[inner_key] = 'type'                               THEN src2.[inner_value] ELSE NULL END)
    , [type_desc]                           = MAX(CASE WHEN src2.[inner_key] = 'type_desc'                          THEN src2.[inner_value] ELSE NULL END)
    , [create_date]                         = MAX(CASE WHEN src2.[inner_key] = 'create_date'                        THEN src2.[inner_value] ELSE NULL END)
    , [modify_date]                         = MAX(CASE WHEN src2.[inner_key] = 'modify_date'                        THEN src2.[inner_value] ELSE NULL END)
    , [is_ms_shipped]                       = MAX(CASE WHEN src2.[inner_key] = 'is_ms_shipped'                      THEN src2.[inner_value] ELSE NULL END)
    , [is_published]                        = MAX(CASE WHEN src2.[inner_key] = 'is_published'                       THEN src2.[inner_value] ELSE NULL END)
    , [is_schema_published]                 = MAX(CASE WHEN src2.[inner_key] = 'is_schema_published'                THEN src2.[inner_value] ELSE NULL END)
    , [lob_data_space_id]                   = MAX(CASE WHEN src2.[inner_key] = 'lob_data_space_id'                  THEN src2.[inner_value] ELSE NULL END)
    , [filestream_data_space_id]            = MAX(CASE WHEN src2.[inner_key] = 'filestream_data_space_id'           THEN src2.[inner_value] ELSE NULL END)
    , [max_column_id_used]                  = MAX(CASE WHEN src2.[inner_key] = 'max_column_id_used'                 THEN src2.[inner_value] ELSE NULL END)
    , [lock_on_bulk_load]                   = MAX(CASE WHEN src2.[inner_key] = 'lock_on_bulk_load'                  THEN src2.[inner_value] ELSE NULL END)
    , [uses_ansi_nulls]                     = MAX(CASE WHEN src2.[inner_key] = 'uses_ansi_nulls'                    THEN src2.[inner_value] ELSE NULL END)
    , [is_replicated]                       = MAX(CASE WHEN src2.[inner_key] = 'is_replicated'                      THEN src2.[inner_value] ELSE NULL END)
    , [has_replication_filter]              = MAX(CASE WHEN src2.[inner_key] = 'has_replication_filter'             THEN src2.[inner_value] ELSE NULL END)
    , [is_merge_published]                  = MAX(CASE WHEN src2.[inner_key] = 'is_merge_published'                 THEN src2.[inner_value] ELSE NULL END)
    , [is_sync_tran_subscribed]             = MAX(CASE WHEN src2.[inner_key] = 'is_sync_tran_subscribed'            THEN src2.[inner_value] ELSE NULL END)
    , [has_unchecked_assembly_data]         = MAX(CASE WHEN src2.[inner_key] = 'has_unchecked_assembly_data'        THEN src2.[inner_value] ELSE NULL END)
    , [text_in_row_limit]                   = MAX(CASE WHEN src2.[inner_key] = 'text_in_row_limit'                  THEN src2.[inner_value] ELSE NULL END)
    , [large_value_types_out_of_row]        = MAX(CASE WHEN src2.[inner_key] = 'large_value_types_out_of_row'       THEN src2.[inner_value] ELSE NULL END)
    , [is_tracked_by_cdc]                   = MAX(CASE WHEN src2.[inner_key] = 'is_tracked_by_cdc'                  THEN src2.[inner_value] ELSE NULL END)
    , [lock_escalation]                     = MAX(CASE WHEN src2.[inner_key] = 'lock_escalation'                    THEN src2.[inner_value] ELSE NULL END)
    , [lock_escalation_desc]                = MAX(CASE WHEN src2.[inner_key] = 'lock_escalation_desc'               THEN src2.[inner_value] ELSE NULL END)
    , [is_filetable]                        = MAX(CASE WHEN src2.[inner_key] = 'is_filetable'                       THEN src2.[inner_value] ELSE NULL END)
    , [is_memory_optimized]                 = MAX(CASE WHEN src2.[inner_key] = 'is_memory_optimized'                THEN src2.[inner_value] ELSE NULL END)
    , [durability]                          = MAX(CASE WHEN src2.[inner_key] = 'durability'                         THEN src2.[inner_value] ELSE NULL END)
    , [durability_desc]                     = MAX(CASE WHEN src2.[inner_key] = 'durability_desc'                    THEN src2.[inner_value] ELSE NULL END)
    , [temporal_type]                       = MAX(CASE WHEN src2.[inner_key] = 'temporal_type'                      THEN src2.[inner_value] ELSE NULL END)
    , [temporal_type_desc]                  = MAX(CASE WHEN src2.[inner_key] = 'temporal_type_desc'                 THEN src2.[inner_value] ELSE NULL END)
    , [history_table_id]                    = MAX(CASE WHEN src2.[inner_key] = 'history_table_id'                   THEN src2.[inner_value] ELSE NULL END)
    , [is_remote_data_archive_enabled]      = MAX(CASE WHEN src2.[inner_key] = 'is_remote_data_archive_enabled'     THEN src2.[inner_value] ELSE NULL END)
    , [is_external]                         = MAX(CASE WHEN src2.[inner_key] = 'is_external'                        THEN src2.[inner_value] ELSE NULL END)
    , [history_retention_period]            = MAX(CASE WHEN src2.[inner_key] = 'history_retention_period'           THEN src2.[inner_value] ELSE NULL END)
    , [history_retention_period_unit]       = MAX(CASE WHEN src2.[inner_key] = 'history_retention_period_unit'      THEN src2.[inner_value] ELSE NULL END)
    , [history_retention_period_unit_desc]  = MAX(CASE WHEN src2.[inner_key] = 'history_retention_period_unit_desc' THEN src2.[inner_value] ELSE NULL END)
    , [is_node]                             = MAX(CASE WHEN src2.[inner_key] = 'is_node'                            THEN src2.[inner_value] ELSE NULL END)
    , [is_edge]                             = MAX(CASE WHEN src2.[inner_key] = 'is_edge'                            THEN src2.[inner_value] ELSE NULL END)
FROM src2
GROUP BY src2.some_uniqueidentifier
    , src2.some_varchar
    , src2.[key]
ORDER BY src2.some_uniqueidentifier
    , src2.some_varchar
    , src2.[key];
The [query plan](https://www.brentozar.com/pastetheplan/?id=Skcotv1AO) uses a couple of nested loops joins regardless of how many rows are contained in the input table. Presumably that's an artifact of using the CROSS APPLY operator. For your fun, I've already set up a [DB Fiddle](https://dbfiddle.uk/?rdbms=sqlserver_2019&sample=adventureworks&fiddle=a548cf359ff55fa77a16677cc05050d6) . Is there a more efficient way of getting the data out of json format into a "real" set of columns? I created the code above using my on-prem SQL Server 2019 instance, however the target will be Azure SQL Database, so all the latest-and-greatest options are available.
Asked by Hannah Vernon (70988 rep)
Jul 16, 2021, 08:38 PM
Last activity: Sep 24, 2021, 04:33 PM