Implementing a lecturer IS scraper
[programming]
For more than two years, I’ve been working as a mathematics lecturer at Populo. Kabinet, the information system we use, is good enough for basic agenda, but it does not offer any way to actively monitor newly available courses (think of a watchdog). At least that was the case in July 2024.
So I decided to reverse-engineer the Kabinet API, and to my surprise, I had a working scraper prototype after a single weekend.
Authentication & Cognito
The first step, of course, was to figure out which authentication method is used, so that I could retrieve the JWT token required by the rest of the Kabinet REST API.
These endpoints were called after a successful login:
So it’s quite obvious the backend uses the AWS Cognito service to authenticate and authorize its users.
Since I didn’t have much knowledge about Cognito, I wasn’t able to figure out what the primary source of the user pool configuration was, so I decided to parse this setting from the main JavaScript bundle, where it is for some reason hardcoded:
...
const yxt = {
  Auth: {
    Cognito: {
      userPoolId: "eu-central-1_LyA4D7N6q",
      identityPoolId: "eu-central-1:edfb5190-0a0a-4b58-bcff-749e12065953",
      userPoolClientId: "3q7sbbp4u743ot2181ou1r8bdf"
    }
  }
};
...
Since the script is obfuscated, I also keep a set of fallback values in my configuration that will be used if the Kabinet developers ever decide to configure the code obfuscation properly. :))
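The extraction itself can be a simple regular expression over the downloaded bundle. Here is a minimal sketch of that idea; the bundle URL is a made-up placeholder, and the assumption is that the keys keep their plain-text names as in the snippet above:

import re

import requests

# hypothetical location of the main bundle; in practice it is discovered from the app's index.html
BUNDLE_URL = "https://kabinet.example/static/js/main.js"

# fallback values, used when the regexes no longer match the (re)obfuscated bundle
FALLBACK_COGNITO_CONFIG = {
    "userPoolId": "eu-central-1_LyA4D7N6q",
    "userPoolClientId": "3q7sbbp4u743ot2181ou1r8bdf",
}


def extract_cognito_config(bundle_url: str = BUNDLE_URL) -> dict:
    """Best-effort extraction of the hardcoded Cognito settings from the JS bundle."""
    source = requests.get(bundle_url, timeout=30).text
    config = {}
    for key in ("userPoolId", "userPoolClientId"):
        match = re.search(rf'{key}\s*:\s*"([^"]+)"', source)
        config[key] = match.group(1) if match else FALLBACK_COGNITO_CONFIG[key]
    return config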
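With the user pool client ID in hand, the token exchange itself can be done through boto3’s cognito-idp client. This is only a sketch: it assumes the app client has the USER_PASSWORD_AUTH flow enabled (otherwise an SRP-based flow, e.g. via pycognito, would be needed), and the function name and parameters are mine, not the real scraper’s:

from typing import Tuple

import boto3
from mypy_boto3_cognito_idp import CognitoIdentityProviderClient
from mypy_boto3_cognito_idp.type_defs import AuthenticationResultTypeTypeDef


def authenticate_to_cognito(
    username: str,
    password: str,
    user_pool_client_id: str,
    region: str = "eu-central-1",
) -> Tuple[CognitoIdentityProviderClient, AuthenticationResultTypeTypeDef]:
    """Exchange Kabinet credentials for Cognito tokens via the user pool app client."""
    cognito_client: CognitoIdentityProviderClient = boto3.client("cognito-idp", region_name=region)
    response = cognito_client.initiate_auth(
        AuthFlow="USER_PASSWORD_AUTH",
        AuthParameters={"USERNAME": username, "PASSWORD": password},
        ClientId=user_pool_client_id,
    )
    # AuthenticationResult carries the AccessToken and the IdToken (the JWT used against the Kabinet API)
    return cognito_client, response["AuthenticationResult"]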
Kabinet API
Good, now we have a usable JWT with which we can fetch the list of available courses, no rocket science here:
from typing import List, Any

import requests
from mypy_boto3_cognito_idp import CognitoIdentityProviderClient
from mypy_boto3_cognito_idp.type_defs import AuthenticationResultTypeTypeDef

AVAILABLE_COURSES_URL = "{api_prefix}/tutor/{user}/available-courses"


def list_available_courses(
    cognito_client: CognitoIdentityProviderClient,
    auth_result: AuthenticationResultTypeTypeDef,
    api_prefix: str,
) -> List[Any]:
    user = cognito_client.get_user(AccessToken=auth_result["AccessToken"])["Username"]
    courses = requests.get(
        AVAILABLE_COURSES_URL.format(user=user, api_prefix=api_prefix),
        headers={
            "Accept": "application/json",
            "Authorization": f"Bearer {auth_result['IdToken']}",
            # override default rubbish sent from session-level
            "User-Agent": None,
            "Connection": None,
            "Accept-Encoding": None,
        },
    )
    return list(courses.json()["data"])
This already gave me a very usable response; I anonymised the payload for confidentiality reasons:
{
  "data": [
    {
      "externalId": "<ID>",
      "topic": {
        "externalId": "<ID>",
        "value": "Anglický jazyk"
      },
      "number": 12345,
      "firstMinute": false,
      "studyStart": "2024-05-06T00:00:00.000Z",
      "studyReason": null,
      "schoolType": {
        "externalId": "<ID>",
        "value": "Základní škola"
      },
      "studyYear": {
        "externalId": "<ID>",
        "value": "9."
      },
      "requiredLevel": {
        "externalId": "<ID>",
        "value": "ZŠ",
        "index": 1
      },
      "requiredQualification": null,
      "language": {
        "externalId": "<ID>",
        "value": "Czech (Czech Republic)"
      },
      "branch": {
        "externalId": "<ID>",
        "value": "Zlín",
        ...
      },
      "timeslots": [
        {
          "externalId": "<ID>",
          "startTime": "2024-04-27T15:00:00.000Z",
          "dueTime": "2024-04-27T20:15:00.000Z",
          "studyType": {
            "externalId": "<ID>",
            "value": "Prezenčně"
          },
          "weekday": {
            "externalId": "<ID>",
            "value": "Pátek",
            "number": 5
          },
          "notes": "PREFERUJ%C3%8D+%2F%2F1x+45min"
        },
        ...
      ],
      "rate": "140.0000",
      "rateBonus": "20.0000",
      "rateType": {
        "externalId": "<ID>",
        "value": "Hod. sazba"
      },
      "interested": false,
      "stared": false,
      "interestedLength": 0
    },
    ...
  ]
}
Basically, the only thing left was to filter the payload down to interesting courses, i.e. courses that match my arbitrary criteria. In my case, I hardcoded these criteria because I couldn’t be bothered to write a persistence layer:
- topic: Mathematics
- school type: High School
- branch: Brno
Once the courses are filtered, all I need is the following data, which helps me decide whether a course is really for me:
from datetime import date
from typing import List

from pydantic import BaseModel


class CourseTimeSlot(BaseModel):
    startHour: int
    endHour: int
    weekDay: str


class InterestingAvailableCourse(BaseModel):
    start: date
    schoolType: str
    studyYear: str
    timeSlots: List[CourseTimeSlot]
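Putting the payload and these models together, the filtering step can be a plain comprehension over the raw course dicts. The following is only a sketch: the field paths come from the anonymised payload above, but the exact Czech string values of the criteria and the helper names are my guesses, and the hour extraction ignores time zones for brevity:

from datetime import datetime
from typing import Any, Dict, List

# hardcoded criteria; the Czech values are assumed to match the strings used by the Kabinet API
WANTED_TOPIC = "Matematika"
WANTED_SCHOOL_TYPE = "Střední škola"
WANTED_BRANCH = "Brno"


def _hour(timestamp: str) -> int:
    # naive hour extraction from the ISO timestamps in the payload
    return datetime.fromisoformat(timestamp.replace("Z", "+00:00")).hour


def filter_interesting_courses(raw_courses: List[Dict[str, Any]]) -> List[InterestingAvailableCourse]:
    return [
        InterestingAvailableCourse(
            start=course["studyStart"][:10],  # keep only the date part
            schoolType=course["schoolType"]["value"],
            studyYear=course["studyYear"]["value"],
            timeSlots=[
                CourseTimeSlot(
                    startHour=_hour(slot["startTime"]),
                    endHour=_hour(slot["dueTime"]),
                    weekDay=slot["weekday"]["value"],
                )
                for slot in course["timeslots"]
            ],
        )
        for course in raw_courses
        if course["topic"]["value"] == WANTED_TOPIC
        and course["schoolType"]["value"] == WANTED_SCHOOL_TYPE
        and course["branch"]["value"] == WANTED_BRANCH
    ]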
As you might suspect, the result of the Kabinet API scraping is a list of InterestingAvailableCourse. The only thing left to do is to send the data somewhere.
Notifications
The trickiest part of the scraper was determining how and where to send notifications.
I was not able to come up with an easy way of sending notifications to my iPhone, so I came up with two ideas:
- if my machine is on the same network the scraper is running in, send a Dunst notification to the Dunst daemon via SSH
- if my machine is not on that network, send an e-mail report using SendGrid
Dunst
The idea is straightforward: if I locate a machine with port 2222 open on the same subnet the scraper is running in, a Dunst notification is sent to that host via an SSH client.
from enum import Enum
from io import StringIO
from typing import List

from paramiko import AutoAddPolicy, ECDSAKey, SSHClient

from populoscraper.config import PopuloScraperConfig
from populoscraper.container_logging import log
from populoscraper.scraper.model import InterestingAvailableCourse, ScrapeSummary


class DunstNotificationSeverity(Enum):
    LOW = "low"
    NORMAL = "normal"
    HIGH = "critical"


def send_dunst_notification(
    config: PopuloScraperConfig,
    interesting_courses: List[InterestingAvailableCourse],
    host: str,
) -> ScrapeSummary:
    ssh_client = None
    try:
        log.info(f"Will send a notification to host {host}.")
        # key is provided in the PEM format from Vault
        private_key = ECDSAKey.from_private_key(StringIO(config.private_ssh_key))
        ssh_client = SSHClient()
        ssh_client.set_missing_host_key_policy(AutoAddPolicy())
        ssh_client.connect(hostname=host, username=config.ssh_username, pkey=private_key, port=config.ssh_port)
        notification_body = "\n".join(
            [
                f"{c.start} - {c.schoolType}, {c.studyYear} tř."
                + "\n"
                + "\n".join([f"\t- {ts.weekDay}: {ts.startHour} - {ts.endHour}" for ts in c.timeSlots])
                for c in interesting_courses
            ]
        )
        notification_command = _send_critical_notification_command_string(
            notification_body=notification_body
        )
        log.info(f"Notification command: {notification_command}")
        ssh_client.exec_command(notification_command)
        return ScrapeSummary.DUNST_NOTIFICATION_SUCCESS
    except Exception as e:
        log.error(f"Error while sending notification: {e}")
        return ScrapeSummary.DUNST_NOTIFICATION_FAILURE
    finally:
        if ssh_client:
            ssh_client.close()


def _send_critical_notification_command_string(
    notification_summary: str = "Nové zajímavé kurzy!",
    notification_body: str = "",
    severity: DunstNotificationSeverity = DunstNotificationSeverity.HIGH,
    app_name: str = "populoscraper",  # assumed app name used for the -c/-a notify-send flags
) -> str:
    return f'notify-send "{notification_summary}" "{notification_body}" -u {severity.value} -c {app_name} -a {app_name}'
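The host discovery itself isn’t shown above; a minimal sketch using only the standard library might look like this (the subnet prefix and timeout are assumptions, not values from the real scraper):

import socket
from typing import Optional


def find_dunst_host(subnet_prefix: str = "192.168.1", port: int = 2222, timeout: float = 0.2) -> Optional[str]:
    """Return the first host on the /24 subnet with the given port open, or None."""
    for last_octet in range(1, 255):
        host = f"{subnet_prefix}.{last_octet}"
        try:
            # attempt a TCP connection; a successful connect means an SSH server is listening
            with socket.create_connection((host, port), timeout=timeout):
                return host
        except OSError:
            continue
    return None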
If there is no active host with port 2222 open, the scraper falls back to sending an e-mail report, which is basically an HTML table.
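A minimal sketch of that fallback using SendGrid’s official Python client follows; the table layout and the parameter names are mine, not the real scraper’s:

from typing import List

from sendgrid import SendGridAPIClient
from sendgrid.helpers.mail import Mail

from populoscraper.scraper.model import InterestingAvailableCourse


def _courses_as_html_table(courses: List[InterestingAvailableCourse]) -> str:
    # render each course as one row; time slots are joined into a single cell
    rows = "".join(
        f"<tr><td>{c.start}</td><td>{c.schoolType}</td><td>{c.studyYear}</td>"
        f"<td>{', '.join(f'{ts.weekDay} {ts.startHour}-{ts.endHour}' for ts in c.timeSlots)}</td></tr>"
        for c in courses
    )
    return f"<table><tr><th>Start</th><th>School</th><th>Year</th><th>Time slots</th></tr>{rows}</table>"


def send_email_report(api_key: str, sender: str, recipient: str, courses: List[InterestingAvailableCourse]) -> None:
    message = Mail(
        from_email=sender,
        to_emails=recipient,
        subject="Nové zajímavé kurzy!",
        html_content=_courses_as_html_table(courses),
    )
    SendGridAPIClient(api_key).send(message)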
Nothing complicated; this is what it looks like: