Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 26 additions & 33 deletions src/TableScraper.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
module TableScraper

using Gumbo: parsehtml
import Gumbo
using Cascadia: @sel_str, nodeText
import HTTP;
import HTTP

export scrape_tables

The `TableScraper.Table` is a Tables.jl-compatible row-accessible type. So you can convert it to
another Tables.jl-compatible type if you wish, e.g. `DataFrame.(scrape_tables(url))` will return a
vector of `DataFrame`s.
"""
"""
    scrape_tables(url, args...)

Download `url` via HTTP, parse the response body as HTML, and scrape every
`<table>` element in it, returning a `Vector{Table}`. Any extra `args`
(e.g. cell/header transform functions) are forwarded to the
`Gumbo.HTMLDocument` method.
"""
function scrape_tables(url, args...)::Vector{Table}
    response::HTTP.Messages.Response =
        try
            HTTP.get(url)
        catch e
            # Print a friendly hint, then propagate the original exception
            # with its backtrace. NOTE: `raise` does not exist in Julia —
            # `rethrow()` is the correct way to re-raise inside a catch block.
            println("Error when attempting to get $url. Make sure you are connected to the internet and the URL is accessible")
            rethrow()
        end

    # The response body is the raw HTML content.
    return scrape_tables(Gumbo.parsehtml(String(response.body)), args...)
end

"""
    scrape_tables(parsed_html::Gumbo.HTMLDocument, args...)

Scrape every `<table>` element of an already-parsed HTML document into a
`Vector{Table}`. Any extra `args` are forwarded to `scrape_table`.
"""
function scrape_tables(parsed_html::Gumbo.HTMLDocument, args...)::Vector{Table}
    # Locate all <table> nodes under the document root, then scrape each one.
    table_elems = eachmatch(sel"table", parsed_html.root)
    return Table[scrape_table(elem, args...) for elem in table_elems]
end

"""
    scrape_table(table_elem::Gumbo.HTMLElement{:table}, cell_transform=nodeText, header_transform=nodeText)

Scrape a single `<table>` element into a `Table`.

`header_transform` is applied to each `<th>` cell and `cell_transform` to each
`<td>` cell. With the default `nodeText` header transform, a header cell with
`colspan=n` is expanded into `n` numbered headers so the header count matches
the number of data columns; with a custom `header_transform` you are on your
own. Body rows containing no `<td>` cells (e.g. header-only rows) are skipped.
"""
function scrape_table(table_elem::Gumbo.HTMLElement{:table}, cell_transform=nodeText, header_transform=nodeText)::Table
    header = []
    for header1 in eachmatch(sel"tr th", table_elem)
        # Check the header span. Use `===` (function identity) and
        # short-circuit `&&` so the `colspan` lookup only happens when the
        # default transform is in use.
        if header_transform === nodeText && haskey(header1.attributes, "colspan")
            colspan = parse(Int, header1.attributes["colspan"])
            for i in 1:colspan
                push!(header, nodeText(header1)*"$i")
            end
        else
            push!(header, header_transform(header1))
        end
    end

    result_arr = []

    # Fill the result with one entry per body row; rows without any <td>
    # (likely header rows) are not added.
    for row in eachmatch(sel"tbody tr", table_elem)
        tds = cell_transform.(eachmatch(sel"td", row))
        if !isempty(tds)
            push!(result_arr, tds)
        end
    end

    return Table(result_arr, header)
end

end