TSQL Challenge 14 - Solution by Steve Howard



-- steve_howard_tsqlchallenge_14.sql
/*  10/2/2009 - Steve Howard
 *  TSQL Challenge #14 - Identify the longest sequence of characters in a string
 *  http://beyondrelational.com/blogs/tc/archive/2009/09/28/tsql-challenge-1...
 *  
 *  Strategy:  This solution uses 3 CTE's.  First, the recursive (Streaks) CTE 
 *  builds up a list of all streaks found in each data.  This includes 
 *  streaks of length 1 and substreaks of longer streaks, necessitating the 
 *  second (GroupedStreaks) CTE.
 *  
 *  The second (GroupedStreaks) CTE takes the data from the Streaks CTE and 
 *  removes length 1 streaks and substreaks.  The GroupedStreaks CTE provides 
 *  all the data necessary for the final result set, however, it does not 
 *  include enough information to properly sort it.  (Sorting is first by 
 *  Data with longest streak, then by Data with the most streaks, then by 
 *  start Pos within each data.)  
 *  
 *  The third (StreakStats) CTE finds the longest streak and the total count 
 *  of streaks for each Data.  Joining this to the GroupedStreaks CTE on Data 
 *  allows proper sorting of the result set.  
 */  

--Sample Data
DECLARE @t TABLE (Data VARCHAR(40) )

INSERT @t (Data) SELECT '9992EDC6-D117-4DEE-B410-4E5FAE46AE97'
INSERT @t (Data) SELECT '0BFC936B-BD9A-4C6A-AFB2-CF3F1752F8B1'
INSERT @t (Data) SELECT '4A73E7EB-7777-4A04-9258-F1E75097977C'
INSERT @t (Data) SELECT '5AAF477C-274D-400D-9067-035968F33B19'
INSERT @t (Data) SELECT '725DA718-30D0-44A9-B36A-89F27CDFEEDE'
INSERT @t (Data) SELECT '8083ED5A-D3B9-4694-BB04-F0B09C588888'

--SELECT * FROM @t

--Recursive CTE to generate all streaks of the same letter/digit
--4 Columns:  
--	Data = the source Data element being looked at from @t
--	[Char] = the actual letter/digit for the current streak
--	Pos = the position (index) within Data at which the current streak began
--	[Len] = the length of the streak (number of same letter/digit in a row)
--
--Initial Conditions:  Start with the first letter of each Data, starting with 
--Pos = 1 and [Len] = 1.
--
--Each Recursion:  Compare the next character from Data (at index Pos+[Len]) 
--to the character of the current streak.  If the character matches, extend 
--the length of the streak by 1.  If it doesn't match, start a new streak for 
--the new character with [Len] = 1 and Pos = the current position, Pos+[Len].
--
--End condition:  Terminate when the current position, Pos+[Len], is greater 
--than the length of the source Data.  
;WITH Streaks (Data, [Char], Pos, [Len]) AS 
(
	SELECT Data, LEFT(Data, 1), 1, 1
	FROM @t
	UNION ALL
	SELECT	Data, 
			SUBSTRING(Data, Pos+[Len], 1), 
			CASE WHEN SUBSTRING(Data, Pos+[Len], 1) = [Char] THEN Pos ELSE Pos + [Len] END, 
			CASE WHEN SUBSTRING(Data, Pos+[Len], 1) = [Char] THEN [Len] + 1 ELSE 1 END 
	FROM Streaks 
	WHERE Pos+[Len] <= LEN(Data)  
), 
--GroupedStreaks CTE filters out streaks of Len = 1 and removes substreaks for 
--the same Data/[Char]/Pos by grouping them and keeping the MAX([Len]).  
GroupedStreaks (Data, [Char], Pos, Len) AS 
(
	SELECT Data, [Char], Pos, MAX([Len])
	FROM Streaks
	WHERE [Len] > 1
	GROUP BY Data, [Char], Pos
), 
--StreakStats CTE finds the longest streak and total count of streaks for each 
--Data from GroupedStreaks.  This is used for ordering the final result set.
StreakStats (Data, LongestStreak, StreakCount) AS 
(
	SELECT Data, MAX([Len]), COUNT([Len])
	FROM GroupedStreaks
	GROUP BY Data
)
--Final SELECT for result set
SELECT GS.Data, GS.[Char], GS.Pos, GS.[Len]
FROM GroupedStreaks GS
INNER JOIN StreakStats SS
	ON GS.Data = SS.Data
ORDER BY SS.LongestStreak DESC, SS.StreakCount DESC, GS.Pos