Skip to content

Commit 508b5b3

Browse files
committed
Initial commit:
new file: Dockerfile; new file: getMedia.sh; new file: startArchivers.sh; new file: updateJson.sh
1 parent ac8deb5 commit 508b5b3

File tree

4 files changed

+144
-0
lines changed

4 files changed

+144
-0
lines changed

Dockerfile

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
FROM python:3-slim-bullseye

# Install runtime dependencies (curl/jq/ffmpeg/git for the archiver scripts).
# apt-get is used instead of apt: apt's CLI is not stable for scripting.
# Update, install, autoremove, and cache cleanup happen in ONE layer so the
# package lists and cache never persist into the image.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends bash curl jq ffmpeg sed git \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Archiver tools: twspace-dl (Twitter Spaces) and snscrape (tweet scraping).
# --no-cache-dir keeps pip's download cache out of the image.
RUN pip install --no-cache-dir \
    git+https://github.com/HoloArchivists/twspace-dl \
    git+https://github.com/JustAnotherArchivist/snscrape.git

# /app/scripts holds the entrypoint scripts; /app/output holds archived data.
RUN mkdir -p /app/scripts /app/output

COPY updateJson.sh /app/scripts/updateJson.sh
COPY getMedia.sh /app/scripts/getMedia.sh
COPY startArchivers.sh /app/scripts/startArchivers.sh

#RUN chown -R 1000:1000 /app
RUN chmod -R 755 /app \
    && chmod -R 777 /app/output \
    && chmod +x /app/scripts/updateJson.sh \
                /app/scripts/getMedia.sh \
                /app/scripts/startArchivers.sh

#USER 1000

WORKDIR /app/output

CMD ["/app/scripts/startArchivers.sh"]

getMedia.sh

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
# getMedia.sh — download all media (pictures and the highest-bitrate variant
# of each video) referenced by an snscrape tweet JSONL file.
#
# Usage: getMedia.sh <jsonFile> <outFolder>
#   $1 - path to the snscrape JSONL tweet archive to read
#   $2 - destination folder root; files are saved under the URL's path

jsonFile="${1}"
outFolder="${2}"
#echo "Reading from: ${jsonFile}"

# Collect media URLs into arrays. mapfile replaces the original
# `array+=($(cmd))`, which word-splits and glob-expands each URL (SC2207).
mapfile -t pictures < <(jq -r '.media[].fullUrl' "${jsonFile}" | tr -d '[],"')
mapfile -t videos < <(jq -r '.media[].variants|sort_by(.bitrate)[-1].url' "${jsonFile}" | tr -d '[],"')

for url in "${pictures[@]}"; do
  [[ -n "$url" ]] || continue
  removeHTTP="${url#*//}"                  # strip the scheme (https://)
  format="${url#*format=}"                 # file extension from ?format=jpg
  format="${format%&*}"                    # drop trailing &name=... params
  # `\?` is a literal question mark; the original unescaped `?` was a glob
  # wildcard matching ANY character before "format=".
  removeFormat="${removeHTTP%\?format=*}"  # URL path without the query string
  echo "$url - Saving to: ${outFolder}/${removeFormat}.${format}"
  # -f: don't save HTTP error bodies as media; -sS: quiet but keep errors;
  # -L: follow redirects.
  curl -fsSL "${url}" --create-dirs -o "${outFolder}/${removeFormat}.${format}"
done

for url in "${videos[@]}"; do
  [[ -n "$url" ]] || continue
  removeHTTP="${url#*//}"            # strip the scheme
  removeTag="${removeHTTP%\?tag=*}"  # drop the ?tag=NN query suffix
  echo "Writing to destination: ${outFolder}/${removeTag}"
  curl -fsSL "${url}" --create-dirs -o "${outFolder}/${removeTag}"
done

startArchivers.sh

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
# startArchivers.sh — launch one updateJson.sh archiver per Twitter username
# in the $usernames environment variable, then wait for all of them.
#
# Environment:
#   usernames - whitespace-separated list of Twitter usernames (required)
#   interval  - polling interval in seconds (default: 60)

# Register cleanup BEFORE spawning jobs: on interrupt/termination, kill the
# whole process group so background archivers do not outlive this script.
# (The original registered this trap after `wait`, where it could never
# catch a signal delivered while the archivers were running.)
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT

# If no interval set, default to 60s.
interval="${interval:-60}"

# Intentionally unquoted: split $usernames on whitespace into an array.
# shellcheck disable=SC2206
unameA=(${usernames})

pids=()
for u in "${unameA[@]}"; do
  bash /app/scripts/updateJson.sh "${u}" "/app/output" "${interval}" &
  pids+=("$!")  # array append; the original PIDs+=$! concatenated a string
done

wait

echo "All scripts ended, exiting in 10 seconds"
sleep 10

updateJson.sh

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#!/bin/bash
# updateJson.sh — continuously archive one Twitter user's tweets, media and
# Spaces into an output folder, polling every $interval seconds.
#
# Usage: updateJson.sh <username> <outputRoot> <intervalSeconds>
#   $1 - Twitter username to archive
#   $2 - output root; this user's data goes in $2/$1, cookies in $2/cookies.txt
#   $3 - seconds to sleep between polling cycles

twitterUN="${1}"
outFolder="${2}/${1}"
interval="${3}"
spacesPath="${outFolder}/spaces"
COOKIES_PATH="${2}/cookies.txt"
echo "$outFolder"

# If a previous media download was interrupted, a leftover .new file exists:
# finish fetching its media, promote it to the live archive, then re-append
# the backup of the old archive (new tweets stay at the top).
if [ -s "${outFolder}/${twitterUN}-tweets.json.new" ]; then
  /app/scripts/getMedia.sh "${outFolder}/${twitterUN}-tweets.json.new" "${outFolder}"
  mv -f "${outFolder}/${twitterUN}-tweets.json.new" "${outFolder}/${twitterUN}-tweets.json"
  if [ -f "${outFolder}/${twitterUN}-tweets.json.bkup" ]; then
    cat "${outFolder}/${twitterUN}-tweets.json.bkup" >> "${outFolder}/${twitterUN}-tweets.json"
  fi
fi

while true; do
  # Ensure the output folder exists (-p makes a pre-check unnecessary).
  mkdir -p "${outFolder}"

  # Start download of any available Spaces; use cookies when present.
  if [ ! -f "$COOKIES_PATH" ]; then
    echo "Starting without cookies"
    twspace_dl -U "https://twitter.com/${twitterUN}" --write-url "${spacesPath}/master_urls.txt" -m -p -o "${spacesPath}/[%(creator_screen_name)s]-%(title)s|%(start_date)s"
  else
    twspace_dl -U "https://twitter.com/${twitterUN}" --write-url "${spacesPath}/master_urls.txt" --input-cookie-file "$COOKIES_PATH" -o "${spacesPath}/[%(creator_screen_name)s]-%(title)s|%(start_date)s"
  fi

  # Archive tweets.
  if ! [ -f "${outFolder}/${twitterUN}-tweets.json" ]; then
    # No existing archive: grab the full profile.
    snscrape --jsonl twitter-profile "${twitterUN}" > "${outFolder}/${twitterUN}-tweets.json.new"
    /app/scripts/getMedia.sh "${outFolder}/${twitterUN}-tweets.json.new" "${outFolder}"
    mv -f "${outFolder}/${twitterUN}-tweets.json.new" "${outFolder}/${twitterUN}-tweets.json"
  else
    # Existing archive: only scrape tweets newer than the latest stored date.
    # The newest tweet is one of the first two JSONL lines (line 1 may be an
    # older pinned tweet — presumably why both are checked; take the max).
    # Piping sed into jq replaces the original unquoted `<<< $(sed …)`,
    # which word-split and glob-expanded the JSON line (SC2046).
    dateOne=$(sed -n '1{p;q;}' "${outFolder}/${twitterUN}-tweets.json" | jq -r '.date')
    dateTwo=$(sed -n '2{p;q;}' "${outFolder}/${twitterUN}-tweets.json" | jq -r '.date')
    # echo "Date one: $dateOne"
    # echo "Date two: $dateTwo"
    dateA=$(date -d "$dateOne" +"%s")
    dateB=$(date -d "$dateTwo" +"%s")
    if [ "$dateA" -ge "$dateB" ]; then
      dateSince=$dateA
    else
      dateSince=$dateB
    fi
    # Convert the epoch back to the timestamp format snscrape expects.
    dateSince=$(date -d "@${dateSince}" +"%Y-%m-%d %H:%M:%S %z")
    snscrape --since "${dateSince}" --jsonl twitter-profile "${twitterUN}" > "${outFolder}/${twitterUN}-tweets.json.new"

    if [ -s "${outFolder}/${twitterUN}-tweets.json.new" ]; then
      # New tweets arrived: back up the old archive, fetch media for the new
      # tweets, promote the new file, then append the old archive after it.
      cp -f "${outFolder}/${twitterUN}-tweets.json" "${outFolder}/${twitterUN}-tweets.json.bkup"
      /app/scripts/getMedia.sh "${outFolder}/${twitterUN}-tweets.json.new" "${outFolder}"
      mv -f "${outFolder}/${twitterUN}-tweets.json.new" "${outFolder}/${twitterUN}-tweets.json"
      cat "${outFolder}/${twitterUN}-tweets.json.bkup" >> "${outFolder}/${twitterUN}-tweets.json"
    fi
  fi
  sleep "${interval}"
done

0 commit comments

Comments
 (0)