Query and Download PDFs from Open Raadsinformatie

This article explains how to query and download PDF documents from the site openraadsinformatie.nl provided by Open State Foundation and VNG using Invantive SQL.

Instructions

To retrieve all PDF documents into a folder of Open Raadsinformatie using the following Invantive SQL statements:

create or replace table urls@inmemorystorage
as
select '' filename
,      '' url
from   dual@datadictionary
where  false

create or replace table output@inmemorystorage
as
select '' filename
,      unicode_to_blob(null) contents
from   dual@datadictionary
where  false

declare
  --
  -- History seems to start at August 1, 2017.
  --
  p_date_start date := to_date('20180101', 'YYYYMMDD');
  p_date_end   date := to_date('20180401', 'YYYYMMDD');
  --
  l_loop   boolean;
  l_date   date;
  l_date_c varchar2;
begin
  l_date := p_date_start;
  while l_date < p_date_end
  loop
    l_date_c := to_char(l_date, 'YYYY-MM-DD');
    insert into urls@InMemoryStorage
    ( filename
    , url
    )
    select case
           when dta.contentType = 'application/pdf'
           then normalize(to_char(dta.id) || '-' || dta.name || '.pdf', 240, false)
           else dta.name
           end
           filename
    ,      dta.originalUrl
    from   httpdownload@datadictionary
           ( url => 'https://api.openraadsinformatie.nl/v1/elastic/ori_*/_search?size=1000&q=date_modified:[' || l_date_c || ' TO ' || l_date_c || ']&pretty=false'
           , method => 'GET'
           ) htp
    join   jsontable
           ( 'hits.hits.[*]'
             passing htp.content_clob
             columns id                  number   path '_id'
             ,       type                varchar2 path '_type'
             ,       name                varchar2 path '_source.name'
             ,       dateModified        varchar2 path '_source.date_modified'
             ,       originalUrl         varchar2 path '_source.original_url'
             ,       contentType         varchar2 path '_source.content_type'
           ) dta
    on     dta.contenttype = 'application/pdf'
    and    dta.originalUrl like 'http%'
    ;
    l_date := l_date + 1;
  end loop;
end;

begin
  for r in ( select * from URLS@InMemoryStorage )
  loop
    --
    -- For now ignore all errors, since httpdownload forwards HTTP errors such as 404.
    --
    begin
      insert into OUTPUT@InMemoryStorage
      ( filename
      , contents
      )
      select r.filename
      ,      fle.content_blob
      from   httpdownload@datadictionary
             ( url => r.url
             , method => 'GET'
             ) fle
      ;
    exception
      when others 
      then 
        null;
    end;
  end loop;
end;

select * 
from   output@inmemorystorage

local export documents in contents to "c:\temp\ori" filename column filename