# Dockerfile (repository forked from xroche/httrack)
# Base image: the Apify Actor image for Python, pinned to the 3.13 tag.
# Apify's published images are listed at https://hub.docker.com/r/apify/.
# Any other image from Docker Hub can be used here instead.
FROM apify/actor-python:3.13
# System packages can only be installed as root.
USER root
# Install HTTrack together with the download tools and CA bundle.
# The package list was derived from setup-wsl.sh and WSL_SETUP.md.
# The shared libraries HTTrack links against (zlib, OpenSSL) are deliberately
# not listed: apt installs them as package dependencies, and hard-coding a
# release-specific name such as libssl3 ties the build to one Debian release.
# Packages are sorted alphabetically to keep diffs small.
# update + install + cache cleanup stay in ONE layer so the package index is
# never stale and never ends up in the image.
RUN echo "=== Installing HTTrack and system dependencies ===" \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        httrack \
        wget \
    && echo "=== Verifying HTTrack installation ===" \
    && httrack --version \
    && echo "=== Cleaning up apt cache ===" \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && echo "=== HTTrack installation complete ==="
# Drop privileges again: the remaining build steps and the container itself
# run as the unprivileged myuser account.
USER myuser
# Pre-create the output directory for scraped websites.
# -p makes this a no-op if the directory already exists.
RUN mkdir -p /home/myuser/scraped_websites
# Copy only requirements.txt first. It should be the only file that affects
# the dependency install in the next step, so builds where just the source
# code changed can reuse the cached dependency layer.
# --chown makes the file owned by myuser without a separate chown layer.
COPY --chown=myuser:myuser requirements.txt ./
# Install the packages listed in requirements.txt.
# The Python version, pip version and the final list of installed packages
# are printed to the build log for debugging.
# --no-cache-dir keeps pip's download/wheel cache out of the image layer;
# the cache is useless at runtime and only inflates the image.
RUN echo "Python version:" \
    && python --version \
    && echo "Pip version:" \
    && pip --version \
    && echo "Installing dependencies:" \
    && pip install --no-cache-dir -r requirements.txt \
    && echo "All installed Python packages:" \
    && pip freeze
# Copy the remaining files and directories with the source code.
# This happens after the dependency install, so most source-only changes
# rebuild quickly from the cached layers above.
# website_scraper.py, README_SCRAPER.md and SETUP_COMPLETE.txt sit in the root
# of the build context, so this single COPY already places them in the working
# directory; copying them again one by one would only add identical layers.
COPY --chown=myuser:myuser . ./
# Make the scraper script executable so it can be invoked directly.
RUN chmod +x website_scraper.py
# Byte-compile everything under src/ at build time as a check that the
# Actor's Python code at least compiles. -q keeps the output quiet
# (only errors are reported).
RUN python3 -m compileall -q src/
# Marker variable signalling that HTTrack is bundled in this image
# (presumably read by the scraper code; verify against website_scraper.py).
# PATH is intentionally left untouched: the apt-installed httrack binary is
# already resolvable on the image's default PATH (the install step runs
# `httrack --version` before any PATH change). Prepending /usr/bin would move
# it ahead of every other entry and could let a /usr/bin python3 or pip shadow
# the interpreter the image is built around.
ENV HTTRACK_INSTALLED=1
# Build-log diagnostics: print the Python version, the first line of
# HTTrack's version output, the current user and the working directory.
# Purely informational; nothing here changes the image contents.
RUN printf '%s\n' "=== Environment Check ===" \
    && printf 'Python: %s\n' "$(python --version)" \
    && printf 'HTTrack: %s\n' "$(httrack --version | head -n 1)" \
    && printf 'User: %s\n' "$(whoami)" \
    && printf 'Working directory: %s\n' "$(pwd)" \
    && printf '%s\n' "======================="
# Default launch command for the Actor: run the src package as a module
# (equivalent to "python3 -m src").
# Exec (JSON array) form is used, so the process is started directly rather
# than through a /bin/sh -c wrapper.
CMD ["python3", "-m", "src"]