scrape_guilds.py: use playwright now
This commit is contained in:
parent
a233cfa432
commit
5d7349062d
24
flake.nix
24
flake.nix
|
@ -125,30 +125,8 @@
|
||||||
python3Packages.aiohttp
|
python3Packages.aiohttp
|
||||||
python3Packages.black
|
python3Packages.black
|
||||||
python3Packages.ipython
|
python3Packages.ipython
|
||||||
python3Packages.trio
|
|
||||||
python3Packages.gql
|
python3Packages.gql
|
||||||
(python3Packages.selenium.overrideAttrs (old: {
|
python3Packages.playwright
|
||||||
postInstall =
|
|
||||||
old.postInstall
|
|
||||||
+ ''
|
|
||||||
for ver in v85 v126 v127 v128; do
|
|
||||||
DEVTOOLS=../common/devtools/chromium/$ver
|
|
||||||
for proto in js browser; do
|
|
||||||
python3 ../common/devtools/convert_protocol_to_json.py \
|
|
||||||
$DEVTOOLS/"$proto"_protocol.pdl \
|
|
||||||
--map_binary_to_string=true \
|
|
||||||
$DEVTOOLS/"$proto"_protocol.json
|
|
||||||
done
|
|
||||||
mkdir -p $DST_PREFIX/common/devtools/$ver
|
|
||||||
python3 generate.py \
|
|
||||||
$DEVTOOLS/browser_protocol.json \
|
|
||||||
$DEVTOOLS/js_protocol.json \
|
|
||||||
$DST_PREFIX/common/devtools/$ver
|
|
||||||
done
|
|
||||||
'';
|
|
||||||
|
|
||||||
nativeBuildInputs = old.nativeBuildInputs ++ [ python3Packages.inflection ];
|
|
||||||
}))
|
|
||||||
];
|
];
|
||||||
|
|
||||||
CHROME = "${chromium}/bin/chromium";
|
CHROME = "${chromium}/bin/chromium";
|
||||||
|
|
|
@ -3,82 +3,59 @@
|
||||||
Scrape guilds with selenium
|
Scrape guilds with selenium
|
||||||
You probably shouldn't use this, but if you really must:
|
You probably shouldn't use this, but if you really must:
|
||||||
* Set the CHROME environment variable to the path to your chrome or chromium
|
* Set the CHROME environment variable to the path to your chrome or chromium
|
||||||
* You probably want to connect to an existing chrome instance so you can log in first,
|
|
||||||
run `chromium --user-data-dir=$HOME/.cache/chromium-emoji-script --remote-debugging-port=9222`
|
|
||||||
to start chromium then `env CHROME=$(command -v chromium) ./scrape_guilds.py 127.0.0.1:9222`
|
|
||||||
* It just gives you json, you have to dump later
|
* It just gives you json, you have to dump later
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
import json
|
import json
|
||||||
import os.path
|
|
||||||
import os
|
import os
|
||||||
|
import os.path
|
||||||
|
import pathlib
|
||||||
import sys
|
import sys
|
||||||
import zlib
|
import zlib
|
||||||
|
|
||||||
from selenium import webdriver
|
from playwright.async_api import async_playwright
|
||||||
import trio
|
import asyncio
|
||||||
|
|
||||||
|
|
||||||
def setup_driver():
|
def websocket_handler(out_file):
|
||||||
options = webdriver.ChromeOptions()
|
async def inner(ws):
|
||||||
if location := os.getenv("CHROME"):
|
print(f"Got websocket at `{ws.url}`")
|
||||||
options.binary_location = location
|
|
||||||
options.add_argument("--start-maximized")
|
|
||||||
options.add_argument(
|
|
||||||
"--user-data-dir=" + os.path.expanduser("~/.cache/chromium-emoji-script")
|
|
||||||
)
|
|
||||||
if len(sys.argv) > 1:
|
|
||||||
options.add_experimental_option("debuggerAddress", sys.argv[1])
|
|
||||||
|
|
||||||
# options.add_experimental_option("detach", True)
|
decompress = zlib.decompressobj()
|
||||||
return webdriver.Chrome(options=options)
|
|
||||||
|
|
||||||
|
async def handle_message(msg):
|
||||||
async def handle_events(listener, out_file):
|
message = json.loads(decompress.decompress(msg))
|
||||||
valid_id = None
|
|
||||||
decompress = None
|
|
||||||
|
|
||||||
async for event in listener:
|
|
||||||
typ = event.__class__.__name__
|
|
||||||
if typ == "WebSocketCreated":
|
|
||||||
if event.url.startswith("wss://gateway.discord.gg/"):
|
|
||||||
valid_id = event.request_id
|
|
||||||
decompress = zlib.decompressobj()
|
|
||||||
elif typ == "WebSocketFrameReceived":
|
|
||||||
if event.request_id != valid_id:
|
|
||||||
continue
|
|
||||||
message = json.loads(
|
|
||||||
decompress.decompress(base64.b64decode(event.response.payload_data))
|
|
||||||
)
|
|
||||||
message_type = message.get("t")
|
message_type = message.get("t")
|
||||||
await out_file.write(json.dumps(message) + "\n")
|
out_file.write(json.dumps(message) + "\n")
|
||||||
# The data we actually want, might as well flush
|
# The data we actually want, might as well flush
|
||||||
if message_type in ["READY", "GUILD_CREATE"]:
|
if message_type in ["READY", "GUILD_CREATE"]:
|
||||||
await out_file.flush()
|
out_file.flush()
|
||||||
|
|
||||||
print("Got message of type", message_type)
|
print("Got message of type", message_type)
|
||||||
|
|
||||||
|
ws.on("framereceived", handle_message)
|
||||||
|
|
||||||
|
return inner
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
out_path = await trio.Path.cwd() / "out" / "discord" / "events.json"
|
out_path = pathlib.Path.cwd() / "out" / "discord" / "events.json"
|
||||||
await out_path.parent.mkdir(parents=True, exist_ok=True)
|
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
out_file = await out_path.open("a")
|
out_file = out_path.open("a")
|
||||||
|
|
||||||
driver = setup_driver()
|
async with async_playwright() as p:
|
||||||
async with driver.bidi_connection() as conn:
|
browser = await p.chromium.launch_persistent_context(
|
||||||
devtools, session = conn.devtools, conn.session
|
user_data_dir=os.path.expanduser("~/.cache/chromium-emoji-script"),
|
||||||
|
executable_path=os.getenv("CHROME"),
|
||||||
await session.execute(devtools.network.enable())
|
headless=False,
|
||||||
listener = session.listen(
|
|
||||||
devtools.network.WebSocketCreated,
|
|
||||||
devtools.network.WebSocketFrameReceived,
|
|
||||||
buffer_size=1024,
|
|
||||||
)
|
)
|
||||||
|
page = await browser.new_page()
|
||||||
|
page.on("websocket", websocket_handler(out_file))
|
||||||
|
await page.goto("https://discord.com/app")
|
||||||
|
|
||||||
await handle_events(listener, out_file)
|
await asyncio.Future()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
trio.run(main)
|
asyncio.run(main())
|
||||||
|
|
Loading…
Reference in a new issue