TSQL Challenge 13 - Solution by Jeff Moden



--===== This is the original data setup script posted at...
     -- http://beyondrelational.com/blogs/tc/archive/2009/08/31/tsql-challenge-1...
     -- The only thing that has been changed is some minor formatting for readablity
DECLARE @t TABLE 
        (
        InvID         INT IDENTITY,
        BatchNumber   INT,
        InvoiceNumber INT, 
        VisitDate     DATETIME,
        Amount        MONEY
        )
 INSERT INTO @t(BatchNumber, InvoiceNumber, VisitDate, Amount)
 SELECT 10000001,20001,'2009-01-01',50.00 UNION ALL
 SELECT 10000001,20001,'2009-01-02',50.00 UNION ALL
 SELECT 10000001,20001,'2009-01-03',50.00 UNION ALL
 SELECT 10000001,20001,'2009-01-04',50.00 UNION ALL
 SELECT 10000001,20002,'2009-01-01',50.00 UNION ALL
 SELECT 10000001,20002,'2009-01-02',50.00 UNION ALL
 SELECT 10000001,20002,'2009-01-03',50.00 UNION ALL
 SELECT 10000001,20003,'2009-01-01',50.00 UNION ALL
 SELECT 10000001,20003,'2009-01-02',50.00 UNION ALL
 SELECT 10000001,20004,'2009-01-01',50.00 UNION ALL
 SELECT 10000001,20004,'2009-01-02',50.00 UNION ALL
 SELECT 10000001,20004,'2009-01-03',50.00 UNION ALL
 SELECT 10000001,20004,'2009-01-04',50.00 UNION ALL
 SELECT 10000001,20004,'2009-01-05',50.00 UNION ALL
 SELECT 10000001,20004,'2009-01-06',50.00 UNION ALL
 SELECT 10000001,20005,'2009-01-01',50.00 UNION ALL
 SELECT 10000001,20005,'2009-01-02',50.00 UNION ALL
 SELECT 10000001,20005,'2009-01-13',50.00 UNION ALL
 SELECT 10000001,20005,'2009-01-14',50.00 UNION ALL
 SELECT 10000001,20005,'2009-01-15',50.00 UNION ALL
 SELECT 10000001,20005,'2009-01-06',50.00 UNION ALL
 SELECT 10000001,20005,'2009-01-07',50.00 UNION ALL
 SELECT 10000001,20005,'2009-01-08',50.00 UNION ALL
 SELECT 10000001,20006,'2009-01-01',50.00 UNION ALL
 SELECT 10000002,20007,'2009-01-01',50.00 UNION ALL
 SELECT 10000002,20007,'2009-01-02',50.00 UNION ALL
 SELECT 10000002,20007,'2009-01-03',50.00 UNION ALL
 SELECT 10000002,20008,'2009-01-01',50.00 UNION ALL
 SELECT 10000002,20008,'2009-01-02',50.00 UNION ALL
 SELECT 10000002,20008,'2009-01-03',50.00
;

--====================================================================
--      Prepare the work table
--====================================================================
--===== Conditionally drop the work table.
     -- You don't need to do this in a stored procedure because Temp tables only have a narrow scope
     -- and they will drop automatically at the end the proc that created them.  I'm doing the drop
     -- here just to make reruns of this code a total "no brainer".
     IF OBJECT_ID('TempDB..#Work','U') IS NOT NULL
        DROP TABLE #Work
;
--===== Create the work table on the fly while populating it with data from the original table.
     -- Note the creation of two new columns.  MyCount uses ROW_NUMBER() to determine how many 
     -- invoices there are for each invoice.  The "first" entry for each invoice will contain the
     -- number of entries for that invoice so we don't need to have a bunch of correlated sub
     -- queries to effectively count how many items there are for each invoice before we pass
     -- through all the invoices.  The other new column is the SET column as stipulated in the 
     -- requirements for this challenge.  Note that I wouldn't have picked a reserved word for a
     -- column name, but it is what it is.  Also note that SELECT/INTO is super fast even if
     -- used in the FULL RECOVERY MODE.  Depending on the machine and the data copied, it can
     -- copy a million rows AND create the target table in just a second or two.
     --
     -- Another little trick.  I need for BatchNumber and InvoiceNumber to be NOT NULL just
     -- like InvID will be because it's an IDENTITY column in the original table.  Notice how
     -- I use ISNULL to accomplish that while copying the data.
     --
     -- Also, since ANSI compliance on this code is out the window anyway, I decided to make
     -- the code VERY readable with the columnalias = expression method.  And, NO, that method
     -- has NOT been deprecated.  'string' = expression is the method that has been deprecated.
     -- Please refer to the 2008 deprecation list in Books Online for proof.
 SELECT InvID, 
        BatchNumber   = ISNULL(BatchNumber,0), 
        InvoiceNumber = ISNULL(InvoiceNumber,0), 
        VisitDate, 
        Amount,
        MyCount       = ROW_NUMBER() OVER (PARTITION BY BatchNumber, InvoiceNumber 
                                               ORDER BY BatchNumber, InvoiceNumber, InvID DESC),
        [Set]         = CAST(0 AS INT)
   INTO #Work 
   FROM @t
;
--===== This is the "magic" dust we need to make this bit of code fly.  This is the required
     -- Clustered index I spoke of earlier.  It must be in the order we expect the "Pseudo
     -- Cursor" update to work in because it will always follow that order when a "Quirky
     -- Update" is used.  Despite a million warnings from various other MVP's, this method
     -- has worked correctly since the days of Sybase and, if properly setup and other "rules"
     -- are followed, NO ONE has ever been able to make this method fail either by accident
     -- or intentionally.  It works in all versions of SQL Server.
  ALTER TABLE #Work
    ADD PRIMARY KEY CLUSTERED (BatchNumber, InvoiceNumber, InvID) WITH FILLFACTOR = 100
;
--===== Just doing a sanity check here so that everyone can see what the MyCount column
     -- contains.  Uncomment this section of code if you want to see it.
-- SELECT * 
--   FROM #Work
--  ORDER BY InvID
--;
--====================================================================
--      Solve the problem as defined in the requirements of Challenge #13 using the "Quirky Update".
--      It's a true "set-based" solution that uses a handful of local working variables and a single
--      UPDATE that uses the "3 part" Quirky Update method whose syntax is documented but not 
--      explained in Books Online.
--
--      I've also made it so that batch size is adjustable by changing the value of a single
--      variable.  This variable could be populated by the content of a table according to vendor
--      and batch type.  For the purposes of this challenge, I've hardcoded the value assignment
--      to "10".  No error checking has been done for this value because the value SHOULD be checked
--      by a CHECK constraint on the recommended "Configuration" table I just mentioned.
--
--      This code will do the job on a million rows in just several seconds.  It's one of the 
--      fastest methods known to do calculated "data smears" like those found in the requirements
--      for this challenge.
--
--      Just to say it again, NO ONE has ever been able to make a properly formed "Quirky Update"
--      that follows the rules fail.
--====================================================================
--===== Declare some obviously named local variables.
     -- Note that these are in alphabetical order to make finding a variable definition easier to do
     -- when troubleshooting.  This is especially important if there are a lot of variables.
DECLARE @BatchSize         INT,
        @CurrentSet        INT,
        @PrevBatchNumber   INT,
        @PrevInvoiceNumber INT,
        @RunningCount      INT
;
--===== Preset the variables including the programmable batch size.
     -- Since the "Quirky Update" isn't exactly ANSI compliant, I threw ANSI compliance out the 
     -- bloody door here.  We're going for speed and readability here just like we did for the 
     -- DECLARE!
 SELECT @BatchSize         = 10,
        @CurrentSet        =  0,
        @PrevBatchNumber   =  0,
        @PrevInvoiceNumber =  0,
        @RunningCount      =  0
;
--===== All set... do the "Quirky Update" just like we would if we were doing this in VB or C#...
     -- Basically, read a row, set some variables according to some decisions, update the SET
     -- column based on those decisions, and set some more variables to remember the current
     -- row's data for when we process the next row.  The details are in the comments in the
     -- code below.
     --
     -- Notice that all of the CASE statements have been "short ciruited" to take the most
     -- likely condition first to maximize performance and reduce CPU usage.
 UPDATE #Work
    SET @RunningCount      = CASE 
                           --==== If same batch/invoice then no change in running count 
                                  WHEN BatchNumber    = @PrevBatchNumber
                                   AND InvoiceNumber  = @PrevInvoiceNumber
                                  THEN @RunningCount
                           --==== If different batch/invoice then add new count to "running count"
                                  ELSE @RunningCount  + MyCount
                              END,
        @CurrentSet        = [Set] -- 3 part "quirky" update (@var = ColName = expression)
                           = CASE  -- This 3 part update changes the value of "SET" in the table.
                           --==== If same batch & running count <= batch size then no change in "SET"
                                  WHEN BatchNumber    = @PrevBatchNumber
                                   AND @RunningCount <= @BatchSize
                                  THEN @CurrentSet
                           --==== If same batch & running count > batch size then add 1 to "SET".
                                  WHEN BatchNumber   = @PrevBatchNumber
                                  THEN @CurrentSet   + 1
                           --==== Otherwise, batch has changed and we need to start at 1
                                  ELSE 1
                              END,
        @RunningCount      = CASE
                           --==== If the running count hasn't exceeded the batch size,
                               -- we keep the same running count.  If the running count
                               -- HAS exceeded the batch size, we've already updated rows
                               -- with a new SET value and we have to reset the running 
                               -- count to remember the rows wejust processed.  Notice 
                               -- that this is the SECOND time we're updating the running
                               -- count variable in this update.  SQL Server allows it
                               -- even when it's not a Quirky Update.
                               -- Heh... like I said, throwing ANSI out the door because
                               -- we're in this for performance.  Besides, the requirements
                               -- said that we need this to run in 2005/2008.
                                  WHEN @RunningCount <= @BatchSize
                                  THEN @RunningCount
                                  ELSE MyCount
                              END,
        @PrevBatchNumber   = BatchNumber,  --Remember current row for next row to be processed
        @PrevInvoiceNumber = InvoiceNumber --Remember current row for next row to be processed
   FROM #Work WITH (TABLOCKX) --TABLOCKX makes just one lock instead of 
 OPTION (MAXDOP 1) --You MUST prevent parallelism for a "Quirky Update" just like in VB or C#
;
--===== Do the "final" SELECT to produce the desired result set according to the requirements
     -- of Challenge 13.
 SELECT InvID, 
        BatchNumber, 
        InvoiceNumber, 
        VisitDate, 
        Amount,
        [Set]
   FROM #Work
  ORDER BY InvID  --NEVER EVER trust the order of a SELECT to a Clustered Index even 
                  --though this sort will NOT show up in the Execution Plan! That type
                  --of stuff ONLY works with the "Quirky Update".  TRUST ME ON THIS!!!
;
--===================================================================
--      Verification Code
--      Heh... to quote Sergiy from SQLServerCentral.com, "A Developer must NOT guess...
--      A Developer must KNOW."  In the name of all that is "holy" in the world of IT, here's a
--      bit of formatted test code to make it easy on the judges to visually verify that all is
--      well in "Quirky Update" land.  This method could easily be adapted as an automated checker.
--      Even with this test, the "Quirky Update" will still beat any cursor and most other "Set
--      Based" methods (except for one possibility that I haven't figured out, yet) that are capable
--      of doing "previous row" calculations like those required to meet this challenge.
--      
--      Yep... I know... reporting is normally the wrong thing to do in SQL especially with the
--      "right justification" I did.  But, if you don't have anything else, now you know how
--      to do it (take a look at the output in the TEXT mode instead of the GRID mode.  Besides,
--      it was fun and I have to get my "Brownie Points" in with the judges by making life easier
--      for them. ;-)  
--
--      As a sidebar, this type of "report" could easily be inserted into a table (without the 
--      fancy justification and skipped lines) so that an automated post run analysis could be 
--      done to give doubters of the "Quirky Update" the nice warm fuzzies that it works every 
--      time.  Even with said analysis, the "Quirky Update" method will still beat most all other
--      methods including all cursors and 99% of all other set based methods. 
--
--      Since GROUPING() in a GROUP BY is well beyond this challenge, I'll leave it up to the reader
--      to look it up in Books Online.  Look under GROUP BY, CUBE... it's not under ROLLUP as you
--      think it might be.
--
--      Before anyone get's started on me about it, COUNT(*) is just as effective as COUNT(1).
--      Unless you have code to prove otherwise, please don't bring it up because I have code to
--      prove they're the same. I also have a bucket of pork chops at the ready.  ;-)
--===================================================================
 SELECT 'Validation Table Follows...'   AS Notice
;
WITH cteConvert AS
(
 SELECT BatchNumber   = STR(BatchNumber,11),
        InvoiceNumber = STR(InvoiceNumber,13),
        [Set]         = STR([Set],7),
        Amount 
   FROM #Work
)
 SELECT BatchNumber   = CASE
                            WHEN GROUPING(BatchNumber) = 0 
                            THEN BatchNumber
                            ELSE ''
                        END,
        InvoiceNumber = CASE
                            WHEN GROUPING(InvoiceNumber) = 0 
                            THEN InvoiceNumber
                            ELSE ''
                        END,
        [  Set]       = CASE
                            WHEN GROUPING([Set]) = 0 
                            THEN [Set]
                            ELSE ''
                        END,
        LineType      = CASE
                            WHEN GROUPING(BatchNumber) = 0 AND GROUPING(InvoiceNumber) = 0 AND GROUPING([Set]) = 0
                            THEN 'SubTotal for Invoice/Set' 

                            WHEN GROUPING(BatchNumber) = 0 AND GROUPING(InvoiceNumber) = 0 AND GROUPING([Set]) = 1
                            THEN 'SubTotal for Invoice'

                            WHEN GROUPING(BatchNumber) = 0 AND GROUPING(InvoiceNumber) = 1 AND GROUPING([Set]) = 1
                            THEN 'SubTotal for BatchNumber'

                            WHEN GROUPING(BatchNumber) = 1 AND GROUPING(InvoiceNumber) = 1 AND GROUPING([Set]) = 1
                            THEN 'Grand Total'
                        END,
        [ Qty]        = STR(COUNT(*),5),
        TotalAmount   = STR(SUM(Amount),11,2),
        [ ]           = CASE --Heh... Guess what this does... hope you know something about ASCII characters. ;-) 
                            WHEN GROUPING([Set]) = 0 
                            THEN ''
                            ELSE CHAR(10) --CHAR(10) is the LineFeed or NewLine character.
                        END
   FROM cteConvert
  GROUP BY 
        BatchNumber,
        InvoiceNumber,
        [Set]
   WITH ROLLUP