SQL query to extract file names from a html content

By
Advertisement

In this article I am going to explain how to extract the filename from html column using sql function.

Below is the function which i used to do this operation



CREATE FUNCTION dbo.fn_getFilenames(@InputHTML NVARCHAR(MAX))
RETURNS @res TABLE (pdf NVARCHAR(MAX)) AS
BEGIN
-- assumes there are no single quotes or double quotes in the PDF filename
DECLARE @i INT, @j INT, @k INT, @tmp NVARCHAR(MAX);
SET @i = CHARINDEX(N'.pdf', @InputHTML);
WHILE @i > 0
BEGIN
  SELECT @tmp = left(@InputHTML, @i+3);
  SELECT @j = CHARINDEX('/', REVERSE(@tmp)); -- directory delimiter
  SELECT @k = CHARINDEX('"', REVERSE(@tmp)); -- start of href
  IF @j = 0 or (@k > 0 and @k < @j) SET @j = @k;
  SELECT @k = CHARINDEX('''', REVERSE(@tmp)); -- start of href (single-quote*)
  IF @j = 0 or (@k > 0 AND @k < @j) SET @j = @k;
  INSERT @res VALUES (SUBSTRING(@tmp, len(@tmp)-@j+2, len(@TMP)));
  SELECT @InputHTML = STUFF(@InputHTML, 1, @i+4, ''); -- remove up to ".pdf"
  SET @i = CHARINDEX(N'.pdf', @InputHTML);
END
RETURN
END
GO

Below is the query to check the output

-- CREATE TABLE


create table mytable (Html varchar(max));

-- INSERT HTML Content

insert into mytable values('
<p>A deferred tuition payment plan,
or view the <a href="/uploadedFiles/uploadedFiles/uploadedFiles/uploadedFiles/Tuition-Reimbursement-Deferred.pdf"
target="_blank">list</a>.</p>')

insert into mytable values('
<p>A deferred tuition payment plan,
or view the <a href="Two files here-Reimbursement-Deferred.pdf"
target="_blank">list</a>.</p>And I use single quotes
   <a href=''/look/path/The second file.pdf''
target="_blank">list</a>');


--SELECT Statement

select t.*, p.pdf


from mytable t
cross apply dbo.extract_filenames_from_a_tags(html) p;

OUTPUT Will be:

SQL query to extract file names from a html content




0 comments:

Post a Comment

Online Casino