I'm extracting data of articles in a website and pars them in my db in a django project using celery and bs4.Here is article model:
articles/model.py
from django.db import models
from conduit.apps.core.models import TimestampedModel
class Article(TimestampedModel):
slug = models.SlugField(db_index=True, max_length=255, unique=True)
title = models.CharField(db_index=True, max_length=255)
description = models.TextField()
body = models.TextField()
# Every article must have an author. This will answer questions like "Who
# gets credit for writing this article?" and "Who can edit this article?".
# Unlike the `User` <-> `Profile` relationship, this is a simple foreign
# key (or one-to-many) relationship. In this case, one `Profile` can have
# many `Article`s.
author = models.ForeignKey(
'profiles.Profile', on_delete=models.CASCADE, related_name='articles'
)
tags = models.ManyToManyField(
'articles.Tag', related_name='articles'
)
def __str__(self):
return self.title
class Comment(TimestampedModel):
body = models.TextField()
article = models.ForeignKey(
'articles.Article', related_name='comments', on_delete=models.CASCADE
)
author = models.ForeignKey(
'profiles.Profile', related_name='comments', on_delete=models.CASCADE
)
class Tag(TimestampedModel):
tag = models.CharField(max_length=255)
slug = models.SlugField(db_index=True, unique=True)
def __str__(self):
return self.tag
profile/model.py
from django.db import models
from conduit.apps.core.models import TimestampedModel
class MyUser(models.Model):
username = models.CharField(max_length=255)
class Profile(TimestampedModel):
user = models.ForeignKey(
MyUser, on_delete=models.CASCADE
)
bio = models.TextField(blank=True)
image = models.URLField(blank=True)
follows = models.ManyToManyField(
'self',
related_name='followed_by',
symmetrical=False
)
favorites = models.ManyToManyField(
'articles.Article',
related_name='favorited_by'
)
def __str__(self):
return self.user.username
def follow(self, profile):
"""Follow `profile` if we're not already following `profile`."""
self.follows.add(profile)
def unfollow(self, profile):
"""Unfollow `profile` if we're already following `profile`."""
self.follows.remove(profile)
def is_following(self, profile):
"""Returns True if we're following `profile`; False otherwise."""
return self.follows.filter(pk=profile.pk).exists()
def is_followed_by(self, profile):
"""Returns True if `profile` is following us; False otherwise."""
return self.followed_by.filter(pk=profile.pk).exists()
def favorite(self, article):
"""Favorite `article` if we haven't already favorited it."""
self.favorites.add(article)
def unfavorite(self, article):
"""Unfavorite `article` if we've already favorited it."""
self.favorites.remove(article)
def has_favorited(self, article):
"""Returns True if we have favorited `article`; else False."""
return self.favorites.filter(pk=article.pk).exists()
and here is the tasks which I added to extract data. I used this tutorial:
https://soshace.com/creating-real-time-api-with-beautiful-soup-and-django-rest-framework/
articles/tasks.py
from time import sleep
from celery import shared_task
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from .models import Article
from ..profiles.models import MyUser, Profile
@shared_task
def find_article():
url = 'https://...'
r = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urlopen(r).read()
soup = BeautifulSoup(html, 'html.parser')
stories = soup.find_all('div', {'class': [
'col u-xs-size12of12 js-trackPostPresentation u-paddingLeft12 u-marginBottom15 u-paddingRight12 u-size6of12',
'col u-xs-size12of12 js-trackPostPresentation u-paddingLeft12 u-marginBottom15 u-paddingRight12 u-size4of12']})
for story in stories:
title = story.find('h3').text if story.find('h3') else '-'
description = story.find('div', {
'class': 'u-fontSize18 u-letterSpacingTight u-lineHeightTight u-marginTop7 u-textColorNormal u-baseColor--textNormal'}).text if story.find(
'div', {
'class': 'u-fontSize18 u-letterSpacingTight u-lineHeightTight u-marginTop7 u-textColorNormal u-baseColor--textNormal'}) else '-'
slug = story.find('a')['href'].split('/')[-1]
author = story.find('a', {
'class': 'ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken'})[
'href'].split('@')[-1]
story_url = story.find('a')['href']
story_page = Request(story_url, headers={'User-Agent': 'Mozilla/5.0'})
story_html = urlopen(story_page).read()
story_soup = BeautifulSoup(story_html, 'html.parser')
sections = story_soup.find_all('section')
for section in sections:
body = section.find('p')
tags = section.find('p')
my_user = MyUser(username=author)
my_user.save()
my_user_profile = Profile(user=my_user.id)
my_user_profile.save()
Article.objects.create(
slug=slug,
title=title,
description=description,
body=body,
author=my_user_profile.id,
tags=tags
)
sleep(5)
find_article()
I get this error when I type in Terminal: celery -A "projectname" worker -l info(according to tutorial):
ValueError: Cannot assign "1": "Profile.user" must be a "MyUser" instance.
I'm a beginner and I can't fix it.
question from:
https://stackoverflow.com/questions/65921472/extracting-data-with-celery-and-beautifulsoup