This article explains how to query and download PDF documents from the site openraadsinformatie.nl provided by Open State Foundation and VNG using Invantive SQL.
Instructions
To retrieve all PDF documents into a folder of Open Raadsinformatie using the following Invantive SQL statements:
create or replace table urls@inmemorystorage
as
select '' filename
, '' url
from dual@datadictionary
where false
create or replace table output@inmemorystorage
as
select '' filename
, unicode_to_blob(null) contents
from dual@datadictionary
where false
declare
--
-- History seems to start at August 1, 2017.
--
p_date_start date := to_date('20180101', 'YYYYMMDD');
p_date_end date := to_date('20180401', 'YYYYMMDD');
--
l_loop boolean;
l_date date;
l_date_c varchar2;
begin
l_date := p_date_start;
while l_date < p_date_end
loop
l_date_c := to_char(l_date, 'YYYY-MM-DD');
insert into urls@InMemoryStorage
( filename
, url
)
select case
when dta.contentType = 'application/pdf'
then normalize(to_char(dta.id) || '-' || dta.name || '.pdf', 240, false)
else dta.name
end
filename
, dta.originalUrl
from httpdownload@datadictionary
( url => 'https://api.openraadsinformatie.nl/v1/elastic/ori_*/_search?size=1000&q=date_modified:[' || l_date_c || ' TO ' || l_date_c || ']&pretty=false'
, method => 'GET'
) htp
join jsontable
( 'hits.hits.[*]'
passing htp.content_clob
columns id number path '_id'
, type varchar2 path '_type'
, name varchar2 path '_source.name'
, dateModified varchar2 path '_source.date_modified'
, originalUrl varchar2 path '_source.original_url'
, contentType varchar2 path '_source.content_type'
) dta
on dta.contenttype = 'application/pdf'
and dta.originalUrl like 'http%'
;
l_date := l_date + 1;
end loop;
end;
begin
for r in ( select * from URLS@InMemoryStorage )
loop
--
-- For now ignore all errors, since httpdownload forwards HTTP errors such as 404.
--
begin
insert into OUTPUT@InMemoryStorage
( filename
, contents
)
select r.filename
, fle.content_blob
from httpdownload@datadictionary
( url => r.url
, method => 'GET'
) fle
;
exception
when others
then
null;
end;
end loop;
end;
select *
from output@inmemorystorage
local export documents in contents to "c:\temp\ori" filename column filename