Dave
2012-07-11 17:46:17 UTC
Hi All
I am working on a process to quickly consume name/value pair data in SQL Server. Some initial testing suggested that I was using a good approach, however I am now getting inconsistent and mostly terrible performance when trying to pivot the data.
It would be great if you guys could take a look and provide a bit of constructive feedback.
I am using a CLR function developed by Andy Novick to concatenate string data.
http://www.mssqltips.com/sqlservertip/2022/concat-aggregates-sql-server-clr-function/
The query takes about 3 hours to run while a count(*) takes a couple minutes. The CPU’s are pretty much maxed out the entire 3 hours.
A select * into from the result set (select * into dbo.IOTest from dbo.NVPResults) only takes a couple minutes which suggests (to me) that IO is not the issue.
Another anomaly is the query ran in 14 minutes (once, not able to reproduce) when I processed the second half of the data (id % 2 = 1).
(1938525 row(s) affected)
2:50:00 (hours)
(1938079 row(s) affected)
00:14:00
I get pretty much the same execution plan when I remove the stuff() function and replace concat with MAX(). Also, I seem to get a much better execution plan when I add a mod function to final where clause.
I do not see the execution plan via this query. I am not sure how to interpret that.
select er.session_id,
ph.query_plan
from sys.dm_exec_requests er
CROSS APPLY sys.dm_exec_query_plan ( er.plan_handle ) ph
Environment Details:
Virtual Environment
Windows Server 2008 R2 Standard
SQL Server 2012 Developer (11.0.2100.60) x64
4 Processors (X5550 @ 2.67)
8 GB Ram
IO = Multiple data files (1 per virtual disk)
Schema:
The NameValue table is partitioned on “hour”.
2012-06-28 04:00:00.000
2012-06-28 05:00:00.000
2012-06-28 06:00:00.000
Cardinality.
raw.ptNameValue_1 = 850 million
stage.sku = 6 million
CREATE TABLE raw.ptNameValue_1 (
date_ SMALLDATETIME NOT NULL
,SkuID INT
,AttributeID INT
,DataBatchID INT
,Value VARCHAR(MAX)
,Value50 AS CONVERT(VARCHAR(50),Value)
, Hierarchy hierarchyid
)
CREATE CLUSTERED INDEX ic_CSVNameValue ON raw.ptNameValue_1 (date_,SkuID)
WITH (DATA_COMPRESSION=PAGE, SORT_IN_TEMPDB=ON) on ps_nvp_1(date_)
CREATE INDEX ix_CSVNameValue ON raw.ptNameValue_1 (Value50)
WITH (DATA_COMPRESSION=PAGE, SORT_IN_TEMPDB=ON) on ps_nvp_1(date_)
CREATE NONCLUSTERED COLUMNSTORE INDEX csindx_simple
ON raw.ptNameValue_1
(SkuID
,AttributeID
,DataBatchID
)
--use #a to filter on attributeid
--use #s to filter on skuid
;WITH p as (
SELECT p1.skuid,p1.attributeid , p1.value50
--select count(*)
from raw.ptNameValue_1 p1 with(nolock)
inner join #s s on s.skuid = p1.skuid --this is just a filter
inner join #a a on a.attributeid = p1.attributeid – this is just a filter
),results as (
select p1.skuid
,dbo.concat(case when p1.attributeid = 203 THEN isnull(CHAR(13) + CHAR(10) + value50,'') else '' end,'') [Col1]
,dbo.concat(case when p1.attributeid = 253 THEN isnull(CHAR(13) + CHAR(10) + value50,'') else '' end,'') [Col2]
,dbo.concat(case when p1.attributeid = 254 THEN isnull(CHAR(13) + CHAR(10) + value50,'') else '' end,'') [CoN]
from p p1
group by skuid)
--select count(*)
--from results
insert into FlatResults
select r.*
from results r
where skuid %2 = 1 --this forces a better execution plan
I am working on a process to quickly consume name/value pair data in SQL Server. Some initial testing suggested that I was using a good approach, however I am now getting inconsistent and mostly terrible performance when trying to pivot the data.
It would be great if you guys could take a look and provide a bit of constructive feedback.
I am using a CLR function developed by Andy Novick to concatenate string data.
http://www.mssqltips.com/sqlservertip/2022/concat-aggregates-sql-server-clr-function/
The query takes about 3 hours to run while a count(*) takes a couple minutes. The CPU’s are pretty much maxed out the entire 3 hours.
A select * into from the result set (select * into dbo.IOTest from dbo.NVPResults) only takes a couple minutes which suggests (to me) that IO is not the issue.
Another anomaly is the query ran in 14 minutes (once, not able to reproduce) when I processed the second half of the data (id % 2 = 1).
(1938525 row(s) affected)
2:50:00 (hours)
(1938079 row(s) affected)
00:14:00
I get pretty much the same execution plan when I remove the stuff() function and replace concat with MAX(). Also, I seem to get a much better execution plan when I add a mod function to final where clause.
I do not see the execution plan via this query. I am not sure how to interpret that.
select er.session_id,
ph.query_plan
from sys.dm_exec_requests er
CROSS APPLY sys.dm_exec_query_plan ( er.plan_handle ) ph
Environment Details:
Virtual Environment
Windows Server 2008 R2 Standard
SQL Server 2012 Developer (11.0.2100.60) x64
4 Processors (X5550 @ 2.67)
8 GB Ram
IO = Multiple data files (1 per virtual disk)
Schema:
The NameValue table is partitioned on “hour”.
2012-06-28 04:00:00.000
2012-06-28 05:00:00.000
2012-06-28 06:00:00.000
Cardinality.
raw.ptNameValue_1 = 850 million
stage.sku = 6 million
CREATE TABLE raw.ptNameValue_1 (
date_ SMALLDATETIME NOT NULL
,SkuID INT
,AttributeID INT
,DataBatchID INT
,Value VARCHAR(MAX)
,Value50 AS CONVERT(VARCHAR(50),Value)
, Hierarchy hierarchyid
)
CREATE CLUSTERED INDEX ic_CSVNameValue ON raw.ptNameValue_1 (date_,SkuID)
WITH (DATA_COMPRESSION=PAGE, SORT_IN_TEMPDB=ON) on ps_nvp_1(date_)
CREATE INDEX ix_CSVNameValue ON raw.ptNameValue_1 (Value50)
WITH (DATA_COMPRESSION=PAGE, SORT_IN_TEMPDB=ON) on ps_nvp_1(date_)
CREATE NONCLUSTERED COLUMNSTORE INDEX csindx_simple
ON raw.ptNameValue_1
(SkuID
,AttributeID
,DataBatchID
)
--use #a to filter on attributeid
--use #s to filter on skuid
;WITH p as (
SELECT p1.skuid,p1.attributeid , p1.value50
--select count(*)
from raw.ptNameValue_1 p1 with(nolock)
inner join #s s on s.skuid = p1.skuid --this is just a filter
inner join #a a on a.attributeid = p1.attributeid – this is just a filter
),results as (
select p1.skuid
,dbo.concat(case when p1.attributeid = 203 THEN isnull(CHAR(13) + CHAR(10) + value50,'') else '' end,'') [Col1]
,dbo.concat(case when p1.attributeid = 253 THEN isnull(CHAR(13) + CHAR(10) + value50,'') else '' end,'') [Col2]
,dbo.concat(case when p1.attributeid = 254 THEN isnull(CHAR(13) + CHAR(10) + value50,'') else '' end,'') [CoN]
from p p1
group by skuid)
--select count(*)
--from results
insert into FlatResults
select r.*
from results r
where skuid %2 = 1 --this forces a better execution plan