Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM ubuntu:focal as app
FROM ubuntu:jammy as app

ARG PYTHON_VERSION=3.12

Expand Down
13 changes: 12 additions & 1 deletion course_discovery/apps/course_metadata/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -880,10 +880,21 @@ class UtilsTests(TestCase):
'<p>Some text</p>\n<p>· Item 1</p>\n<ul>\n<li>Item 2</li>\n</ul>\n<p>Regular paragraph</p>\n<p>· Item 3</p>'
)
)
@ddt.data(
(
'<p><em>The content of this course also forms part of the six-month online<a href="https://example.com">Example Link</a></em></p>', # pylint: disable=line-too-long
'<p><em>The content of this course also forms part of the six-month online <a href="https://example.com">Example Link</a></em></p>' # pylint: disable=line-too-long

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you clarify what the difference is between these two <p> tag lines? They look identical.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @ankit-sonata, the difference between the two strings is that the input string has no space before the <a> tag, whereas the expected output includes one.
I will correct the input string to reflect proper spacing.

),
(
'<div><p>online course.</p><p><strong>Module 1:</strong></p></div>',
'<p>online course. <strong>Module 1:</strong></p>'
)
)
@ddt.unpack
def test_clean_html(self, content, expected):
""" Verify the method removes unnecessary HTML attributes. """

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nbalne why was the docstring removed? Maybe you want to update it instead?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated

assert clean_html(content) == expected
result = clean_html(content)
assert result == expected, f"\nExpected:\n{expected}\nGot:\n{result}"

def test_skill_data_transformation(self):
category_data = {
Expand Down
15 changes: 9 additions & 6 deletions course_discovery/apps/course_metadata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,8 @@ def clean_html(content):
(indicating right-to-left direction), this method will ensure that the 'dir' attribute is preserved
or added to maintain consistency with the original content.
"""
if not content:
return ''
LIST_TAGS = ['ul', 'ol']
is_list_with_dir_attr_present = False

Expand All @@ -790,12 +792,13 @@ def clean_html(content):
cleaned = cleaned.replace('<p><b></b></p>', '')
html_converter = HTML2TextWithLangSpans(bodywidth=None)
html_converter.wrap_links = False
cleaned = html_converter.handle(cleaned).strip()
cleaned = markdown.markdown(cleaned)
for tag in LIST_TAGS:
cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">') if is_list_with_dir_attr_present else cleaned

return cleaned
markdown_text = html_converter.handle(cleaned).strip()
cleaned = markdown.markdown(markdown_text)
cleaned = re.sub(r'([^\s>])\s*(<a\b)', r'\1 \2', cleaned)
if is_list_with_dir_attr_present:
for tag in LIST_TAGS:
cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">')
return cleaned.strip()


def get_file_from_drive_link(image_url):
Expand Down
Loading