[1/2] videobuf2: Add a non-coherent contiguous DMA mode

Message ID	1310675711-39744-2-git-send-email-corbet@lwn.net (mailing list archive)
State	RFC, archived
Headers	Envelope-to: mchehab@infradead.org Delivery-date: Thu, 14 Jul 2011 20:35:54 +0000 From: Jonathan Corbet <corbet@lwn.net> To: linux-media@vger.kernel.org Cc: Marek Szyprowski <m.szyprowski@samsung.com>, Mauro Carvalho Chehab <mchehab@redhat.com>, Jonathan Corbet <corbet@lwn.net> Subject: [PATCH 1/2] videobuf2: Add a non-coherent contiguous DMA mode Date: Thu, 14 Jul 2011 14:35:10 -0600 Message-Id: <1310675711-39744-2-git-send-email-corbet@lwn.net> In-Reply-To: <1310675711-39744-1-git-send-email-corbet@lwn.net> References: <1310675711-39744-1-git-send-email-corbet@lwn.net> Sender: linux-media-owner@vger.kernel.org Precedence: bulk

Message ID

1310675711-39744-2-git-send-email-corbet@lwn.net (mailing list archive)

State

RFC, archived

Headers

Envelope-to: mchehab@infradead.org
Delivery-date: Thu, 14 Jul 2011 20:35:54 +0000
From: Jonathan Corbet <corbet@lwn.net>
To: linux-media@vger.kernel.org
Cc: Marek Szyprowski <m.szyprowski@samsung.com>,
	Mauro Carvalho Chehab <mchehab@redhat.com>,
	Jonathan Corbet <corbet@lwn.net>
Subject: [PATCH 1/2] videobuf2: Add a non-coherent contiguous DMA mode
Date: Thu, 14 Jul 2011 14:35:10 -0600
Message-Id: <1310675711-39744-2-git-send-email-corbet@lwn.net>
In-Reply-To: <1310675711-39744-1-git-send-email-corbet@lwn.net>
References: <1310675711-39744-1-git-send-email-corbet@lwn.net>
Sender: linux-media-owner@vger.kernel.org
Precedence: bulk

Commit Message

Jonathan Corbet July 14, 2011, 8:35 p.m. UTC

  This patch adds videobuf2-dma-nc.c and related stuff; it implements a new
contiguous DMA mode which uses non-coherent memory.  Going non-coherent can
improve performance greatly (3x frame rate increase over coherent on the
Marvell Armada 610 controller), relieves pressure on coherent memory pools,
and avoids possible page attribute conflict problems.  It also is easy to
support within the VB2 framework.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 drivers/media/video/Kconfig            |    5 ++
 drivers/media/video/Makefile           |    1 +
 drivers/media/video/videobuf2-dma-nc.c |  125 ++++++++++++++++++++++++++++++++
 include/media/videobuf2-dma-nc.h       |    9 +++
 4 files changed, 140 insertions(+), 0 deletions(-)
 create mode 100644 drivers/media/video/videobuf2-dma-nc.c
 create mode 100644 include/media/videobuf2-dma-nc.h

Comments

Marek Szyprowski July 15, 2011, 6:05 a.m. UTC | #1

Hello,

On Thursday, July 14, 2011 10:35 PM Jonathan Corbet wrote:

> This patch adds videobuf2-dma-nc.c and related stuff; it implements a new
> contiguous DMA mode which uses non-coherent memory.  Going non-coherent can
> improve performance greatly (3x frame rate increase over coherent on the
> Marvell Armada 610 controller), relieves pressure on coherent memory pools,
> and avoids possible page attribute conflict problems.  It also is easy to
> support within the VB2 framework.
> 
> Signed-off-by: Jonathan Corbet <corbet@lwn.net>
> ---
>  drivers/media/video/Kconfig            |    5 ++
>  drivers/media/video/Makefile           |    1 +
>  drivers/media/video/videobuf2-dma-nc.c |  125
> ++++++++++++++++++++++++++++++++
>  include/media/videobuf2-dma-nc.h       |    9 +++
>  4 files changed, 140 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/media/video/videobuf2-dma-nc.c
>  create mode 100644 include/media/videobuf2-dma-nc.h
> 
> diff --git a/drivers/media/video/Kconfig b/drivers/media/video/Kconfig
> index 7c88e0c..92be525 100644
> --- a/drivers/media/video/Kconfig
> +++ b/drivers/media/video/Kconfig
> @@ -55,6 +55,11 @@ config VIDEOBUF2_DMA_CONTIG
>  	select VIDEOBUF2_MEMOPS
>  	tristate
> 
> +config VIDEOBUF2_DMA_NC
> +	select VIDEOBUF2_CORE
> +	select VIDEOBUF2_MEMOPS
> +	tristate
> +
>  config VIDEOBUF2_VMALLOC
>  	select VIDEOBUF2_CORE
>  	select VIDEOBUF2_MEMOPS
> diff --git a/drivers/media/video/Makefile b/drivers/media/video/Makefile
> index 5bb3b6e..11539d8 100644
> --- a/drivers/media/video/Makefile
> +++ b/drivers/media/video/Makefile
> @@ -118,6 +118,7 @@ obj-$(CONFIG_VIDEOBUF2_MEMOPS)		+= videobuf2-
> memops.o
>  obj-$(CONFIG_VIDEOBUF2_VMALLOC)		+= videobuf2-vmalloc.o
>  obj-$(CONFIG_VIDEOBUF2_DMA_CONTIG)	+= videobuf2-dma-contig.o
>  obj-$(CONFIG_VIDEOBUF2_DMA_SG)		+= videobuf2-dma-sg.o
> +obj-$(CONFIG_VIDEOBUF2_DMA_NC)		+= videobuf2-dma-nc.o
> 
>  obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o
> 
> diff --git a/drivers/media/video/videobuf2-dma-nc.c
> b/drivers/media/video/videobuf2-dma-nc.c
> new file mode 100644
> index 0000000..fecda72
> --- /dev/null
> +++ b/drivers/media/video/videobuf2-dma-nc.c
> @@ -0,0 +1,125 @@
> +/*
> + * This is a videobuf2 memory allocator for contiguous but noncoherent
> + * memory.  With luck, it will prove faster and will not strain
> + * the (possibly limited) supplies of coherent memory.
> + *
> + * Some constraints:
> + *
> + * - The userptr hack used by videobuf2-dma-contig is not supported
> + *   here; it is only usable by out-of-tree code anyway.
> + *
> + * - Drivers are charged with creating and tearing down the streaming
> + *   mappings, probably in their buf_prepare() and buf_finish() functions.
> + *
> + * Copyright 2011 Jonathan Corbet <corbet@lwn.net>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation.
> + */
> +#include <linux/module.h>
> +#include <linux/err.h>
> +#include <linux/slab.h>
> +#include <linux/atomic.h>
> +#include <linux/mm.h>
> +
> +#include <media/videobuf2-core.h>
> +#include <media/videobuf2-memops.h>
> +
> +
> +struct vb2_nc_buf {
> +	void	*vaddr;		/* The actual buffer */
> +	unsigned long size;
> +	atomic_t refcount;
> +	struct vb2_vmarea_handler vm_handler;
> +};
> +
> +/*
> + * Weird VB2 reference counting: the core increments the atomic_t
> + * count directly, but calls back to us to query or decrement it.
> + */
> +static void vb2_dma_nc_put(void *vbuf)
> +{
> +	struct vb2_nc_buf *buf = vbuf;
> +
> +	if (atomic_dec_and_test(&buf->refcount)) {
> +		free_pages_exact(buf->vaddr, buf->size);
> +		kfree(buf);
> +	}
> +}
> +
> +static unsigned int vb2_dma_nc_num_users(void *vbuf)
> +{
> +	struct vb2_nc_buf *buf = vbuf;
> +
> +	return atomic_read(&buf->refcount);
> +	/* Let's hope they don't fork() about now... */

This comment is really not needed here. vm_handler takes care of correct
reference
counting of the buffer when vma is duplicated (in case of operations like fork).

> +}
> +
> +/*
> + * Allocate a buffer and associated pages.
> + */
> +static void *vb2_dma_nc_alloc(void *alloc_ctx, unsigned long size)
> +{
> +	struct vb2_nc_buf *buf;
> +
> +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
> +	if (!buf)
> +		return ERR_PTR(-ENOMEM);
> +	buf->vaddr = alloc_pages_exact(size, GFP_KERNEL);
> +	if (!buf->vaddr) {
> +		kfree(buf);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +	buf->size = size;
> +	atomic_set(&buf->refcount, 1);
> +
> +	buf->vm_handler.refcount = &buf->refcount;
> +	buf->vm_handler.put = vb2_dma_nc_put;
> +	buf->vm_handler.arg = buf;
> +	return buf;
> +}
> +
> +static void *vb2_dma_nc_vaddr(void *vbuf)
> +{
> +	struct vb2_nc_buf *buf = vbuf;
> +
> +	if (!buf)
> +		return NULL;
> +	return buf->vaddr;
> +}
> +
> +
> +static int vb2_dma_nc_mmap(void *vbuf, struct vm_area_struct *vma)
> +{
> +	struct vb2_nc_buf *buf = vbuf;
> +	unsigned long uaddr = vma->vm_start;
> +	void *vaddr;
> +	int ret;
> +
> +	if (!buf)
> +		return -EINVAL; /* Can this really happen? */
> +	for (vaddr = buf->vaddr; uaddr < vma->vm_end;
> +	     uaddr += PAGE_SIZE, vaddr += PAGE_SIZE) {
> +		ret = vm_insert_page(vma, uaddr, virt_to_page(vaddr));
> +		if (ret)
> +			return ret;  /* Undo partial mapping?? */
> +	}
> +	vma->vm_private_data = &buf->vm_handler;
> +	vma->vm_ops = &vb2_common_vm_ops;
> +	vma->vm_ops->open(vma);
> +	return 0;
> +}
> +
> +const struct vb2_mem_ops vb2_dma_nc_memops = {
> +	.alloc		= vb2_dma_nc_alloc,
> +	.put		= vb2_dma_nc_put,
> +	.vaddr		= vb2_dma_nc_vaddr,
> +	.mmap		= vb2_dma_nc_mmap,
> +	.num_users	= vb2_dma_nc_num_users
> +};
> +EXPORT_SYMBOL_GPL(vb2_dma_nc_memops);
> +
> +MODULE_DESCRIPTION("Non-coherent videobuf2 memory operations");
> +MODULE_AUTHOR("Jonathan Corbet");
> +MODULE_LICENSE("GPL");
> diff --git a/include/media/videobuf2-dma-nc.h b/include/media/videobuf2-
> dma-nc.h
> new file mode 100644
> index 0000000..a234dce
> --- /dev/null
> +++ b/include/media/videobuf2-dma-nc.h
> @@ -0,0 +1,9 @@
> +/*
> + * Noncoherent contiguous DMA support for videobuf2
> + */
> +#ifndef _MEDIA_VIDEOBUF2_DMA_NC_H
> +#define _MEDIA_VIDEOBUF2_DMA_NC_H
> +
> +extern const struct vb2_mem_ops vb2_dma_nc_memops;
> +
> +#endif

I would add a pointer to driver's struct device as an argument for this memory
allocator context (like it is done for dma_contig/coherent) and move
dma_map_single()
and dma_unmap_single() calls directly into the allocator. The driver needs only
to
call dma_sync_single_for_{cpu,device} in buf_prepare and buf_finish
respectively.

The allocator does it's job right, but I still have some concerns.
alloc_pages_exact()
are really not so reliable if system is running for a longer time and memory
gets
fragmented. This allocator also will not get any advantage of the IOMMU module
if such
is available (the allocator will still use one big chunk of physically
contiguous
memory block instead of allocating smaller chunks and mapping them contiguously
into
device io address space).

I plan to focus on these issues once I finish working on dma-mapping extensions.
My
idea is to introduce dma_alloc_attrs() function which will unify
dma_alloc_coherent,
dma_alloc_writecombine, dma_alloc_non-coherent and provide some additional 
functionality. This way the driver will be able to provide some attributes to 
control the properties of the memory. By default a standard COHERENT
(=non-cached)
memory will be provided, but we can have attributes for WRITECOMBINE memory and
NON-COHERENT (afair specific to some pci busses) and real CACHED memory (which
requires manual synchronization).

Having a common way of allocating a dma buffer enables us to use vb2-dma-contig 
allocator for all sub-types of the memory.

Best regards

Jonathan Corbet July 15, 2011, 2:30 p.m. UTC | #2

Hi, Marek,

Thanks for having a look.

> > +static unsigned int vb2_dma_nc_num_users(void *vbuf)
> > +{
> > +	struct vb2_nc_buf *buf = vbuf;
> > +
> > +	return atomic_read(&buf->refcount);
> > +	/* Let's hope they don't fork() about now... */
> 
> This comment is really not needed here. vm_handler takes care of correct
> reference
> counting of the buffer when vma is duplicated (in case of operations like fork).

I'm just a little worried about race conditions with this refcounting; what
keeps the count from changing between when you check it (for a value of
one, not zero) and when you act on it?  I mentioned fork() because it won't
care about any local locks.  But...if the count is <=1, that means, I
guess, that nobody has the buffer mapped, so fork() is not going to take
things.  So it's probably safe; I'll take the comment out.

> I would add a pointer to driver's struct device as an argument for this memory
> allocator context (like it is done for dma_contig/coherent) and move
> dma_map_single()
> and dma_unmap_single() calls directly into the allocator. The driver needs only
> to
> call dma_sync_single_for_{cpu,device} in buf_prepare and buf_finish
> respectively.

I had thought about doing that, but decided to mirror the scatter/gather
version, which pushes the mapping into the drivers.  I do think consistency
makes some sense; if the mapping is to be done in the memops code, dma-sg
should change.

The problem is that there's no convenient callback into the allocators
where the mapping and unmapping can be done now.  So I'd have had to add a
couple of memops to do that.  You *can't* do the mapping at allocation
time...  Rather than add more memops, I decided to do it in the driver,
where some there are callbacks that happen at the right times.

Would you rather I added the memops and did things that way?

> The allocator does it's job right, but I still have some concerns.
> alloc_pages_exact()
> are really not so reliable if system is running for a longer time and memory
> gets
> fragmented. 

Trust me, I'm well aware of that - though compaction and THP have made that
a whole lot better than it was.  The real point, though, is this:
alloc_pages_exact() is *way* more reliable than dma_alloc_coherent() on
some systems.

> This allocator also will not get any advantage of the IOMMU module
> if such
> is available (the allocator will still use one big chunk of physically
> contiguous
> memory block instead of allocating smaller chunks and mapping them contiguously
> into
> device io address space).

True, but the system I'm working on doesn't have a nice IOMMU like that.
Extending the allocator for such support doesn't seem like that hard a
thing to do, but I don't have the hardware to do it with.

> I plan to focus on these issues once I finish working on dma-mapping extensions.
> My
> idea is to introduce dma_alloc_attrs() function which will unify
> dma_alloc_coherent,
> dma_alloc_writecombine, dma_alloc_non-coherent and provide some additional 
> functionality. This way the driver will be able to provide some attributes to 
> control the properties of the memory. By default a standard COHERENT
> (=non-cached)
> memory will be provided, but we can have attributes for WRITECOMBINE memory and
> NON-COHERENT (afair specific to some pci busses) and real CACHED memory (which
> requires manual synchronization).
> 
> Having a common way of allocating a dma buffer enables us to use vb2-dma-contig 
> allocator for all sub-types of the memory.

I had thought about just extending dma-contig to support both modes, but I
didn't find enough common code there to make it worthwhile.  I'm not sure
how much value there is in mashing it all into one box; the drivers have to
be aware of the differences in each case.

Improving the DMA API makes sense - it's been fairly static for a long
time.  I also wouldn't expect any such changes to be merged anytime real
soon - you're playing in a lot of sensitive arch and mm playgrounds (I
suspect you've noticed that :).  

In the mean time I have an allocator which increases frame rates by a
factor of three on my hardware, one which could easily be ready for 3.2 (no
point in trying to rush it for 3.1, certainly).  Do I understand you to say
that you'd rather not see it go in?  My preference would be to merge it; we
can always make a switch if and when something better shows up elsewhere.
I doubt it will have accumulated a huge number of users.  What say you?

Thanks,

jon
--
To unsubscribe from this list: send the line "unsubscribe linux-media" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Marek Szyprowski July 18, 2011, 7 a.m. UTC | #3

Hello,

On Friday, July 15, 2011 4:30 PM Jonathan Corbet wrote:

> Hi, Marek,
> 
> Thanks for having a look.
> 
> > > +static unsigned int vb2_dma_nc_num_users(void *vbuf)
> > > +{
> > > +	struct vb2_nc_buf *buf = vbuf;
> > > +
> > > +	return atomic_read(&buf->refcount);
> > > +	/* Let's hope they don't fork() about now... */
> >
> > This comment is really not needed here. vm_handler takes care of correct
> > reference
> > counting of the buffer when vma is duplicated (in case of operations like
> fork).
> 
> I'm just a little worried about race conditions with this refcounting; what
> keeps the count from changing between when you check it (for a value of
> one, not zero) and when you act on it?  I mentioned fork() because it won't
> care about any local locks.  But...if the count is <=1, that means, I
> guess, that nobody has the buffer mapped, so fork() is not going to take
> things.  So it's probably safe; I'll take the comment out.
> 
> > I would add a pointer to driver's struct device as an argument for this
> memory
> > allocator context (like it is done for dma_contig/coherent) and move
> > dma_map_single()
> > and dma_unmap_single() calls directly into the allocator. The driver
> needs only
> > to
> > call dma_sync_single_for_{cpu,device} in buf_prepare and buf_finish
> > respectively.
> 
> I had thought about doing that, but decided to mirror the scatter/gather
> version, which pushes the mapping into the drivers.  I do think consistency
> makes some sense; if the mapping is to be done in the memops code, dma-sg
> should change.
> 
> The problem is that there's no convenient callback into the allocators
> where the mapping and unmapping can be done now.  So I'd have had to add a
> couple of memops to do that.

I think that some additional callbacks for allocators for synchronization
buffer state will be required sooner or later anyway, so imho it is better
to add them now to avoid massive fixing the drivers in the future.

>  You *can't* do the mapping at allocation time...

Could you elaborate why you can't create the mapping at allocation time? 
DMA-mapping api requires the following call sequence:
dma_map_single()
...
dma_sync_single_for_cpu()
dma_sync_single_for_device()
...
dma_unmap_single()

I see no problem to call dma_map_single() on buffer creation and 
dma_unmap_single() on release. dma_sync_single_for_{device,cpu} can
be used on buffer_{prepare,finish}.

>  Rather than add more memops, I decided to do it in the driver,
> where some there are callbacks that happen at the right times.
> 
> Would you rather I added the memops and did things that way?

Scatter-gather allocator was our first meeting with dma-mapping api, now I
noticed we will need to cleanup it a bit.

> > The allocator does it's job right, but I still have some concerns.
> > alloc_pages_exact()
> > are really not so reliable if system is running for a longer time and
> memory
> > gets
> > fragmented.
> 
> Trust me, I'm well aware of that - though compaction and THP have made that
> a whole lot better than it was.  The real point, though, is this:
> alloc_pages_exact() is *way* more reliable than dma_alloc_coherent() on
> some systems.

OK.

> > This allocator also will not get any advantage of the IOMMU module
> > if such
> > is available (the allocator will still use one big chunk of physically
> > contiguous
> > memory block instead of allocating smaller chunks and mapping them
> contiguously
> > into
> > device io address space).
> 
> True, but the system I'm working on doesn't have a nice IOMMU like that.
> Extending the allocator for such support doesn't seem like that hard a
> thing to do, but I don't have the hardware to do it with.
> 
> > I plan to focus on these issues once I finish working on dma-mapping
> extensions.
> > My
> > idea is to introduce dma_alloc_attrs() function which will unify
> > dma_alloc_coherent,
> > dma_alloc_writecombine, dma_alloc_non-coherent and provide some
> additional
> > functionality. This way the driver will be able to provide some
> attributes to
> > control the properties of the memory. By default a standard COHERENT
> > (=non-cached)
> > memory will be provided, but we can have attributes for WRITECOMBINE
> memory and
> > NON-COHERENT (afair specific to some pci busses) and real CACHED memory
> (which
> > requires manual synchronization).
> >
> > Having a common way of allocating a dma buffer enables us to use vb2-dma-
> contig
> > allocator for all sub-types of the memory.
> 
> I had thought about just extending dma-contig to support both modes, but I
> didn't find enough common code there to make it worthwhile.  I'm not sure
> how much value there is in mashing it all into one box; the drivers have to
> be aware of the differences in each case.
> 
> Improving the DMA API makes sense - it's been fairly static for a long
> time.  I also wouldn't expect any such changes to be merged anytime real
> soon - you're playing in a lot of sensitive arch and mm playgrounds (I
> suspect you've noticed that :).

Yes, I'm aware of that.

> In the mean time I have an allocator which increases frame rates by a
> factor of three on my hardware, one which could easily be ready for 3.2 (no
> point in trying to rush it for 3.1, certainly).  Do I understand you to say
> that you'd rather not see it go in?  My preference would be to merge it; we
> can always make a switch if and when something better shows up elsewhere.
> I doubt it will have accumulated a huge number of users.  What say you?

I just wanted to point out that it will be a temporary solution until
dma-mapping
and vb2-dma-contig allocator are extended with new features, but I see no
problems
to merge it now. It gives significant performance gain that definitely is worth
it.
The only problem is the name of the allocator. We probably shouldn't reuse names
that have different meaning in other related frameworks. non-coherent memory in
dma-mapping api is reserved for some special type of memory which has single
synchronization call, so the "non-coherent" might be confusing.

Best regards

Jonathan Corbet July 22, 2011, 7:55 p.m. UTC | #4

> > The problem is that there's no convenient callback into the allocators
> > where the mapping and unmapping can be done now.  So I'd have had to add a
> > couple of memops to do that.
> 
> I think that some additional callbacks for allocators for synchronization
> buffer state will be required sooner or later anyway, so imho it is better
> to add them now to avoid massive fixing the drivers in the future.

OK, I can certainly do a version of the patch along those lines.  I'd
envision some sort of give_buffer_to_device() and give_buffer_to_cpu()
calls (with better names).  It'll take me a little while to get it done,
though - travel and such are upon me.

> >  You *can't* do the mapping at allocation time...
> 
> Could you elaborate why you can't create the mapping at allocation time? 
> DMA-mapping api requires the following call sequence:
> dma_map_single()
> ...
> dma_sync_single_for_cpu()
> dma_sync_single_for_device()
> ...
> dma_unmap_single()
> 
> I see no problem to call dma_map_single() on buffer creation and 
> dma_unmap_single() on release. dma_sync_single_for_{device,cpu} can
> be used on buffer_{prepare,finish}.

Yes, it could be done that way.  I guess I've always, rightly or wrongly,
seen streaming mappings as transient things that aren't meant to be kept
around for long periods of time.  Especially if they might, somehow, be
taking up limited resources like IOMMU slots.  But I honestly have no idea
whether it's better to keep a set of mappings around and use the sync
functions, or whether it's better to remake them each time.

> The only problem is the name of the allocator. We probably shouldn't
> reuse names that have different meaning in other related
> frameworks. non-coherent memory in dma-mapping api is reserved for some
> special type of memory which has single synchronization call, so the
> "non-coherent" might be confusing.

I'm certainly not tied to the name - got a better idea?  The module uses
streaming mappings, but, somehow, reusing "streaming" in this context
doesn't seem likely to make things clearer...:)

Thanks,

jon
--
To unsubscribe from this list: send the line "unsubscribe linux-media" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Sakari Ailus July 23, 2011, 5:48 a.m. UTC | #5

Hi Jonathan,

On Fri, Jul 22, 2011 at 01:55:47PM -0600, Jonathan Corbet wrote:
> > >  You *can't* do the mapping at allocation time...
> > 
> > Could you elaborate why you can't create the mapping at allocation time? 
> > DMA-mapping api requires the following call sequence:
> > dma_map_single()
> > ...
> > dma_sync_single_for_cpu()
> > dma_sync_single_for_device()
> > ...
> > dma_unmap_single()
> > 
> > I see no problem to call dma_map_single() on buffer creation and 
> > dma_unmap_single() on release. dma_sync_single_for_{device,cpu} can
> > be used on buffer_{prepare,finish}.
> 
> Yes, it could be done that way.  I guess I've always, rightly or wrongly,
> seen streaming mappings as transient things that aren't meant to be kept
> around for long periods of time.  Especially if they might, somehow, be
> taking up limited resources like IOMMU slots.  But I honestly have no idea
> whether it's better to keep a set of mappings around and use the sync
> functions, or whether it's better to remake them each time.

Creating IOMMU mappings (and removing them) usually takes considerable
amount of time but usually consume practically no resources, so they are
kept while the buffers are pinned to system memory.

Do you have hardware which has limitations on IOMMU mappings?

For example, the OMA 3 IOMMU can be used to map the whole system memory (if
you need it) and it does page tabe walking, too.

Mauro Carvalho Chehab Sept. 3, 2011, 1:51 p.m. UTC | #6

Em 22-07-2011 16:55, Jonathan Corbet escreveu:
>>> The problem is that there's no convenient callback into the allocators
>>> where the mapping and unmapping can be done now.  So I'd have had to add a
>>> couple of memops to do that.
>>
>> I think that some additional callbacks for allocators for synchronization
>> buffer state will be required sooner or later anyway, so imho it is better
>> to add them now to avoid massive fixing the drivers in the future.
> 
> OK, I can certainly do a version of the patch along those lines.  I'd
> envision some sort of give_buffer_to_device() and give_buffer_to_cpu()
> calls (with better names).  It'll take me a little while to get it done,
> though - travel and such are upon me.

Hi Jon,

I'm understanding that I can just mark those two patches as RFC at patchwork
and wait for your version two of those patchsets.

So, I'll just remove those two from my pending queue and move on ;)

Thanks,
mauro
--
To unsubscribe from this list: send the line "unsubscribe linux-media" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Jonathan Corbet Sept. 3, 2011, 1:58 p.m. UTC | #7

On Sat, 03 Sep 2011 10:51:53 -0300
Mauro Carvalho Chehab <mchehab@redhat.com> wrote:

> I'm understanding that I can just mark those two patches as RFC at patchwork
> and wait for your version two of those patchsets.
> 
> So, I'll just remove those two from my pending queue and move on ;)

That's the right thing to do.  I'm hoping to return to a more rational
schedule after Plumbers and catch up with a few of these things...

Thanks,

jon
--
To unsubscribe from this list: send the line "unsubscribe linux-media" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

diff mbox

Patch

diff --git a/drivers/media/video/Kconfig b/drivers/media/video/Kconfig
index 7c88e0c..92be525 100644
--- a/drivers/media/video/Kconfig
+++ b/drivers/media/video/Kconfig
@@ -55,6 +55,11 @@  config VIDEOBUF2_DMA_CONTIG
 	select VIDEOBUF2_MEMOPS
 	tristate
 
+config VIDEOBUF2_DMA_NC
+	select VIDEOBUF2_CORE
+	select VIDEOBUF2_MEMOPS
+	tristate
+
 config VIDEOBUF2_VMALLOC
 	select VIDEOBUF2_CORE
 	select VIDEOBUF2_MEMOPS
diff --git a/drivers/media/video/Makefile b/drivers/media/video/Makefile
index 5bb3b6e..11539d8 100644
--- a/drivers/media/video/Makefile
+++ b/drivers/media/video/Makefile
@@ -118,6 +118,7 @@  obj-$(CONFIG_VIDEOBUF2_MEMOPS)		+= videobuf2-memops.o
 obj-$(CONFIG_VIDEOBUF2_VMALLOC)		+= videobuf2-vmalloc.o
 obj-$(CONFIG_VIDEOBUF2_DMA_CONTIG)	+= videobuf2-dma-contig.o
 obj-$(CONFIG_VIDEOBUF2_DMA_SG)		+= videobuf2-dma-sg.o
+obj-$(CONFIG_VIDEOBUF2_DMA_NC)		+= videobuf2-dma-nc.o
 
 obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o
 
diff --git a/drivers/media/video/videobuf2-dma-nc.c b/drivers/media/video/videobuf2-dma-nc.c
new file mode 100644
index 0000000..fecda72
--- /dev/null
+++ b/drivers/media/video/videobuf2-dma-nc.c
@@ -0,0 +1,125 @@ 
+/*
+ * This is a videobuf2 memory allocator for contiguous but noncoherent
+ * memory.  With luck, it will prove faster and will not strain
+ * the (possibly limited) supplies of coherent memory.
+ *
+ * Some constraints:
+ *
+ * - The userptr hack used by videobuf2-dma-contig is not supported
+ *   here; it is only usable by out-of-tree code anyway.
+ *
+ * - Drivers are charged with creating and tearing down the streaming
+ *   mappings, probably in their buf_prepare() and buf_finish() functions.
+ *
+ * Copyright 2011 Jonathan Corbet <corbet@lwn.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/atomic.h>
+#include <linux/mm.h>
+
+#include <media/videobuf2-core.h>
+#include <media/videobuf2-memops.h>
+
+
+struct vb2_nc_buf {
+	void	*vaddr;		/* The actual buffer */
+	unsigned long size;
+	atomic_t refcount;
+	struct vb2_vmarea_handler vm_handler;
+};
+
+/*
+ * Weird VB2 reference counting: the core increments the atomic_t
+ * count directly, but calls back to us to query or decrement it.
+ */
+static void vb2_dma_nc_put(void *vbuf)
+{
+	struct vb2_nc_buf *buf = vbuf;
+
+	if (atomic_dec_and_test(&buf->refcount)) {
+		free_pages_exact(buf->vaddr, buf->size);
+		kfree(buf);
+	}
+}
+
+static unsigned int vb2_dma_nc_num_users(void *vbuf)
+{
+	struct vb2_nc_buf *buf = vbuf;
+
+	return atomic_read(&buf->refcount);
+	/* Let's hope they don't fork() about now... */
+}
+
+/*
+ * Allocate a buffer and associated pages.
+ */
+static void *vb2_dma_nc_alloc(void *alloc_ctx, unsigned long size)
+{
+	struct vb2_nc_buf *buf;
+
+	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+	buf->vaddr = alloc_pages_exact(size, GFP_KERNEL);
+	if (!buf->vaddr) {
+		kfree(buf);
+		return ERR_PTR(-ENOMEM);
+	}
+	buf->size = size;
+	atomic_set(&buf->refcount, 1);
+
+	buf->vm_handler.refcount = &buf->refcount;
+	buf->vm_handler.put = vb2_dma_nc_put;
+	buf->vm_handler.arg = buf;
+	return buf;
+}
+
+static void *vb2_dma_nc_vaddr(void *vbuf)
+{
+	struct vb2_nc_buf *buf = vbuf;
+
+	if (!buf)
+		return NULL;
+	return buf->vaddr;
+}
+
+
+static int vb2_dma_nc_mmap(void *vbuf, struct vm_area_struct *vma)
+{
+	struct vb2_nc_buf *buf = vbuf;
+	unsigned long uaddr = vma->vm_start;
+	void *vaddr;
+	int ret;
+
+	if (!buf)
+		return -EINVAL; /* Can this really happen? */
+	for (vaddr = buf->vaddr; uaddr < vma->vm_end;
+	     uaddr += PAGE_SIZE, vaddr += PAGE_SIZE) {
+		ret = vm_insert_page(vma, uaddr, virt_to_page(vaddr));
+		if (ret)
+			return ret;  /* Undo partial mapping?? */
+	}
+	vma->vm_private_data = &buf->vm_handler;
+	vma->vm_ops = &vb2_common_vm_ops;
+	vma->vm_ops->open(vma);
+	return 0;
+}
+
+const struct vb2_mem_ops vb2_dma_nc_memops = {
+	.alloc		= vb2_dma_nc_alloc,
+	.put		= vb2_dma_nc_put,
+	.vaddr		= vb2_dma_nc_vaddr,
+	.mmap		= vb2_dma_nc_mmap,
+	.num_users	= vb2_dma_nc_num_users
+};
+EXPORT_SYMBOL_GPL(vb2_dma_nc_memops);
+
+MODULE_DESCRIPTION("Non-coherent videobuf2 memory operations");
+MODULE_AUTHOR("Jonathan Corbet");
+MODULE_LICENSE("GPL");
diff --git a/include/media/videobuf2-dma-nc.h b/include/media/videobuf2-dma-nc.h
new file mode 100644
index 0000000..a234dce
--- /dev/null
+++ b/include/media/videobuf2-dma-nc.h
@@ -0,0 +1,9 @@ 
+/*
+ * Noncoherent contiguous DMA support for videobuf2
+ */
+#ifndef _MEDIA_VIDEOBUF2_DMA_NC_H
+#define _MEDIA_VIDEOBUF2_DMA_NC_H
+
+extern const struct vb2_mem_ops vb2_dma_nc_memops;
+
+#endif