I'm trying to insert some data into a database, unfortunately it fails and does not save, I suspect my data is structured in a bad way. The data gets printed nicely (1 title, link and date per object) in process_item before attempting to save "print(title, link, date)", however it fails to save it. Title, link and date each holds 1 string...
Thankful for your help
Error:
"Traceback (most recent call last):
File "spider.py", line 63, in <module>
presstv = spider_html(presstv_url, presstv_extract_item, presstv_xpath, presstv_pipeline)
File "spider.py", line 58, in spider_html
pipeline.process_item(extract_function(element), None)
File "/Users/dav/Projects/python/news/pipeline.py", line 76, in process_item
if session.query(Presstv).filter_by(link=item['link']) == None:
TypeError: 'Presstv' object is not subscriptable"
Code
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from models import Nordfront, Presstv, db_connect, create_presstv_table
import json
class PresstvPipeline(object):
"""Pipeline for storing scraped items in the database"""
def __init__(self):
"""
Initializes database connection and sessionmaker.
Creates deals table.
"""
engine = db_connect()
create_presstv_table(engine)
self.Session = sessionmaker(bind=engine)
def process_item(self, items, spider):
session = self.Session()
for title, link, date in zip(items['title'], items['link'], items['date']):
print(title, link, date)
item = Presstv(title = title, link = link, date = date)
if session.query(Presstv).filter_by(link=item['link']) == None:
try:
session.add(item)
session.commit()
logger.info('Item saved')
except:
session.rollback()
raise
finally:
session.close()
return item
Model:
class Presstv(DeclarativeBase):
"""Sqlalchemy deals model"""
__tablename__ = "presstv"
id = Column(Integer, primary_key=True)
title = Column('title', String)
description = Column('description', String, nullable=True)
link = Column('link', String, unique=True)
date = Column('date', String, nullable=True)
created_at = Column('created_at', DateTime, default=_get_date)
You should use:
if session.query(Presstv).filter_by(link=item.link) == None:
as item is now a object from SQLAlchemy. This probably happened because you are using items['link'] a few lines before that but item is now an instance of that class so you should access its values using .link.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With