Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/issue 213 #219

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions youtube_transcript_api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,6 @@
FailedToCreateConsentCookie,
YouTubeRequestFailed,
InvalidVideoId,
VideoUnplayable,
LoginRequired
)
61 changes: 60 additions & 1 deletion youtube_transcript_api/_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class CouldNotRetrieveTranscript(Exception):
ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
CAUSE_MESSAGE = ''
REASON_MESSAGE = '{cause}: {reason}\n{subreason}'
GITHUB_REFERRAL = (
'\n\nIf you are sure that the described cause is not responsible for this error '
'and that a transcript should be retrievable, please create an issue at '
Expand All @@ -17,7 +18,8 @@ class CouldNotRetrieveTranscript(Exception):
'Also make sure that there are no open issues which already describe your problem!'
)

def __init__(self, video_id):
def __init__(self, video_id, playability=None):
    """
    Remember the video id and optional playability data, then build the message.

    :param video_id: id of the video whose transcript could not be retrieved
    :param playability: optional playability data; when truthy, the ``cause``
        property appends its "reason" and subreason to the cause message
    """
    # Set the attributes before building the message; the message builder
    # presumably reads them (its body is not visible here -- confirm).
    self.video_id = video_id
    self.playability = playability
    message = self._build_error_message()
    super(CouldNotRetrieveTranscript, self).__init__(message)

Expand All @@ -32,6 +34,14 @@ def _build_error_message(self):

@property
def cause(self):
    """
    Return the cause text for this error.

    Without playability data this is just ``CAUSE_MESSAGE``. Otherwise the
    "reason" entry of the playability data and its subreason (if any) are
    appended using the ``REASON_MESSAGE`` template.
    """
    playability = self.playability
    if not playability:
        # no (or empty) playability data: plain cause message
        return self.CAUSE_MESSAGE

    # use the playability error reason the API presented
    detail = get_playability_subreason(playability)
    reason = playability.get("reason")
    return self.REASON_MESSAGE.format(
        cause=self.CAUSE_MESSAGE,
        reason=reason,
        subreason=detail
    )


Expand Down Expand Up @@ -100,6 +110,12 @@ class CookiesInvalid(CouldNotRetrieveTranscript):
class FailedToCreateConsentCookie(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'

class VideoUnplayable(CouldNotRetrieveTranscript):
    """Error selected by get_playability_error for the 'UNPLAYABLE' playability status."""
    CAUSE_MESSAGE = 'Unplayable video'

class LoginRequired(CouldNotRetrieveTranscript):
    """Error selected by get_playability_error for the 'LOGIN_REQUIRED' playability status."""
    CAUSE_MESSAGE = 'Login required'


class NoTranscriptFound(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = (
Expand All @@ -118,3 +134,46 @@ def cause(self):
requested_language_codes=self._requested_language_codes,
transcript_data=str(self._transcript_data),
)


def get_playability_error(playability_json):
    """
    Pick the exception class matching the "status" of a playabilityStatus dict.

    Anything that is not {"status": "OK"} is likely an error.

    :param playability_json: dict parsed from the "playabilityStatus" json
    :return: the exception class (not an instance): ``LoginRequired`` for
        'LOGIN_REQUIRED', ``VideoUnplayable`` for 'UNPLAYABLE' and
        ``TranscriptsDisabled`` for every other status
    """
    status = playability_json.get("status")

    # Age related playability. Review feedback on this change notes that
    # LOGIN_REQUIRED also happens when YouTube asks to
    # "Sign in to confirm you're not a bot" (see #337).
    if status == 'LOGIN_REQUIRED':
        return LoginRequired

    # region/country lock playability
    if status == 'UNPLAYABLE':
        return VideoUnplayable

    # fallback for every other status
    return TranscriptsDisabled


def get_playability_subreason(playability_json):
    """
    Traverses playability json nested struct to pick out the subreason, if any.

    The text is looked up at
    errorScreen -> playerErrorMessageRenderer -> subreason -> runs[0] -> text.

    :param playability_json: dict parsed from the "playabilityStatus" json
    :return: the "text" of the first subreason run, or "" when any level of
        the nesting is missing, null or empty
    """
    # `or {}` / `or []` also cover keys that are present but null, so a
    # partially filled struct yields "" instead of raising.
    error_screen = playability_json.get("errorScreen") or {}
    renderer = error_screen.get("playerErrorMessageRenderer") or {}
    runs = (renderer.get("subreason") or {}).get("runs") or []

    if not runs:
        return ""

    # only the first run is reported; a run without "text" counts as empty
    return runs[0].get("text") or ""
crhowell marked this conversation as resolved.
Show resolved Hide resolved
27 changes: 23 additions & 4 deletions youtube_transcript_api/_transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
NoTranscriptAvailable,
FailedToCreateConsentCookie,
InvalidVideoId,
VideoUnplayable,
LoginRequired,
get_playability_error
)
from ._settings import WATCH_URL

Expand All @@ -50,17 +53,33 @@ def fetch(self, video_id):

def _extract_captions_json(self, html, video_id):
splitted_html = html.split('"captions":')

if len(splitted_html) <= 1:
if video_id.startswith('http://') or video_id.startswith('https://'):
raise InvalidVideoId(video_id)
if 'class="g-recaptcha"' in html:
raise TooManyRequests(video_id)
if '"playabilityStatus":' not in html:
raise VideoUnavailable(video_id)

raise TranscriptsDisabled(video_id)


# attempt to parse the playability reason from the html.
playability_splitted_html = html.split('"playabilityStatus":')
if len(playability_splitted_html) <= 1:
# if we didn't find "playabilityStatus" to split on, fall back.
raise TranscriptsDisabled(video_id)

# fall back if we cannot split on "videoDetails" (a key after "playabilityStatus")
raw_details = playability_splitted_html[1].split(',"videoDetails')
if len(raw_details) <= 1:
raise TranscriptsDisabled(video_id)

playability_status_json = json.loads(
raw_details[0].replace('\n', '')
)

playability_error = get_playability_error(playability_status_json)
raise playability_error(video_id, playability_status_json)

# we were able to split on "captions":
captions_json = json.loads(
splitted_html[1].split(',"videoDetails')[0].replace('\n', '')
).get('playerCaptionsTracklistRenderer')
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions youtube_transcript_api/test/test_api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from unittest import TestCase
from mock import patch

Expand All @@ -21,6 +22,8 @@
FailedToCreateConsentCookie,
YouTubeRequestFailed,
InvalidVideoId,
VideoUnplayable,
LoginRequired
)


Expand Down Expand Up @@ -198,6 +201,26 @@ def test_get_transcript__exception_if_video_unavailable(self):
with self.assertRaises(VideoUnavailable):
YouTubeTranscriptApi.get_transcript('abc')

def test_get_transcript__exception_if_video_unplayable(self):
    # Serve the stored watch page (presumably of an unplayable video, going
    # by the asset name) and expect get_transcript to raise VideoUnplayable.
    httpretty.register_uri(
        httpretty.GET,
        'https://www.youtube.com/watch',
        body=load_asset('youtube_video_unplayable.html.static')
    )

    self.assertRaises(
        VideoUnplayable,
        YouTubeTranscriptApi.get_transcript,
        'kZsVStYdmws'
    )

def test_get_transcript__exception_if_login_required(self):
    # Serve the stored watch page (presumably of a login-gated video, going
    # by the asset name) and expect get_transcript to raise LoginRequired.
    httpretty.register_uri(
        httpretty.GET,
        'https://www.youtube.com/watch',
        body=load_asset('youtube_video_login_required.html.static')
    )

    self.assertRaises(
        LoginRequired,
        YouTubeTranscriptApi.get_transcript,
        '4FN12sqoC4Y'
    )

def test_get_transcript__exception_if_youtube_request_fails(self):
httpretty.register_uri(
httpretty.GET,
Expand Down