[1/3] fixup! mm: alloc_contig_freed_pages() added

Message ID ea1bc31120e0670a044de6af7b3c67203c178065.1316617681.git.mina86@mina86.com (mailing list archive)
State RFC, archived

Commit Message

Michal Nazarewicz Sept. 21, 2011, 3:19 p.m. UTC
From: Michal Nazarewicz <mina86@mina86.com>

Signed-off-by: Michal Nazarewicz <mina86@mina86.com>

---
 include/linux/page-isolation.h |    4 ++-
 mm/page_alloc.c                |   66 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 60 insertions(+), 10 deletions(-)

> On Fri, 2011-08-19 at 16:27 +0200, Marek Szyprowski wrote:
>> +unsigned long alloc_contig_freed_pages(unsigned long start, unsigned long end,
>> +				       gfp_t flag)
>> +{
>> +	unsigned long pfn = start, count;
>> +	struct page *page;
>> +	struct zone *zone;
>> +	int order;
>> +
>> +	VM_BUG_ON(!pfn_valid(start));
>> +	zone = page_zone(pfn_to_page(start));

On Thu, 08 Sep 2011 20:05:52 +0200, Dave Hansen <dave@linux.vnet.ibm.com> wrote:
> This implies that start->end are entirely contained in a single zone.
> What enforces that?  If some higher layer enforces that, I think we
> probably need at least a VM_BUG_ON() in here and a comment about who
> enforces it.
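
A minimal sketch of the kind of check being asked for, assuming the caller is
indeed responsible for passing a single-zone range (the helper name and its
placement are made up for illustration and are not part of the posted patch):

static inline struct zone *contig_range_zone(unsigned long start,
					     unsigned long end)
{
	struct zone *zone;

	VM_BUG_ON(!pfn_valid(start) || !pfn_valid(end - 1));
	zone = page_zone(pfn_to_page(start));
	/* Endpoint check only; a full check would walk every pageblock. */
	VM_BUG_ON(page_zone(pfn_to_page(end - 1)) != zone);
	return zone;
}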

>> +	spin_lock_irq(&zone->lock);
>> +
>> +	page = pfn_to_page(pfn);
>> +	for (;;) {
>> +		VM_BUG_ON(page_count(page) || !PageBuddy(page));
>> +		list_del(&page->lru);
>> +		order = page_order(page);
>> +		zone->free_area[order].nr_free--;
>> +		rmv_page_order(page);
>> +		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
>> +		pfn  += 1 << order;
>> +		if (pfn >= end)
>> +			break;
>> +		VM_BUG_ON(!pfn_valid(pfn));
>> +		page += 1 << order;
>> +	}

> This 'struct page *'++ stuff is OK, but only for small, aligned areas.
> For at least some of the sparsemem modes (non-VMEMMAP), you could walk
> off of the end of the section_mem_map[] when you cross a MAX_ORDER
> boundary.  I'd feel a little bit more comfortable if pfn_to_page() was
> being done each time, or only occasionally when you cross a section
> boundary.
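
The conservative variant mentioned there would look roughly like this (a
sketch only, reusing the locals of the quoted function; it trades one
pfn_to_page() lookup per buddy block for correctness in every memory model):

	pfn = start;
	for (;;) {
		page = pfn_to_page(pfn);	/* re-resolved for every block */
		VM_BUG_ON(page_count(page) || !PageBuddy(page));
		list_del(&page->lru);
		order = page_order(page);
		zone->free_area[order].nr_free--;
		rmv_page_order(page);
		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
		pfn += 1UL << order;
		if (pfn >= end)
			break;
		VM_BUG_ON(!pfn_valid(pfn));
	}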

Do the attached changes seem to make sense?

I wanted to avoid calling pfn_to_page() each time as it seems fairly
expensive in sparsemem and discontig modes.  At the same time, the
macro trickery is there so that users of sparsemem-vmemmap and flatmem
won't have to pay that price.
  

Comments

Dave Hansen Sept. 21, 2011, 3:45 p.m. UTC | #1
On Wed, 2011-09-21 at 17:19 +0200, Michal Nazarewicz wrote:
> Do the attached changes seem to make sense?

The logic looks OK.

> I wanted to avoid calling pfn_to_page() each time as it seems fairly
> expensive in sparsemem and discontig modes.  At the same time, the
> macro trickery is there so that users of sparsemem-vmemmap and flatmem
> won't have to pay that price.

Personally, I'd say paying the (incredibly minuscule) runtime cost is worth
it to avoid making folks' eyes bleed when they see those macros.  I think
there are some nicer ways to do it.

Is there a reason you can't logically do something like this?

	page = pfn_to_page(pfn);
	for (;;) {
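		/* ... buddy removal and pfn advance elided ... */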
		if (pfn_to_section_nr(pfn) == pfn_to_section_nr(pfn+1))
			page++;
		else
			page = pfn_to_page(pfn+1);
	}

pfn_to_section_nr() is a register shift.  Our smallest section size is
128MB on x86 and 16MB on ppc64.  So, at *WORST* (64k pages on ppc64),
you're doing pfn_to_page() once every 256 loop iterations.
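
For reference, the arithmetic behind that worst case (assuming the
16MB-section, 64k-page combination cited above):

/*
 * pfns per section = (16 << 20) / (64 << 10) = 256,
 * so pfn_to_section_nr(pfn) can change at most once every 256 consecutive
 * pfns, and the pfn_to_page() slow path runs at most that often.
 */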

My suggestion would be to put a macro up in the sparsemem headers that
does something like:

#ifdef CONFIG_SPARSEMEM_VMEMMAP
#define zone_pfn_same_memmap(pfn1, pfn2) (1)
#elif defined(CONFIG_SPARSEMEM)
static inline int zone_pfn_same_memmap(unsigned long pfn1, unsigned long pfn2)
{
	return (pfn_to_section_nr(pfn1) == pfn_to_section_nr(pfn2));
}
#else
#define zone_pfn_same_memmap(pfn1, pfn2) (1)
#endif

The zone_ bit is necessary in the naming because DISCONTIGMEM's memmap is
at least contiguous within a zone.  Only the non-VMEMMAP sparsemem case
isn't.

Other folks would probably have a use for something like that.  Although
most of the previous users have gotten to this point, given up, and just
done pfn_to_page() on each loop. :)
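
Put together, the buddy-stripping loop from the patch would then read roughly
as follows (a sketch based on the suggestion above, not the code that was
actually posted):

	page = pfn_to_page(pfn);
	for (;;) {
		VM_BUG_ON(page_count(page) || !PageBuddy(page));
		list_del(&page->lru);
		order = page_order(page);
		zone->free_area[order].nr_free--;
		rmv_page_order(page);
		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
		pfn += 1UL << order;
		if (pfn >= end)
			break;
		VM_BUG_ON(!pfn_valid(pfn));
		/* pfn - (1UL << order) is the pfn that "page" points at now */
		if (zone_pfn_same_memmap(pfn - (1UL << order), pfn))
			page += 1UL << order;
		else
			page = pfn_to_page(pfn);
	}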

> +#if defined(CONFIG_FLATMEM) || defined(CONFIG_SPARSEMEM_VMEMMAP)
> +
> +/*
> + * In FLATMEM and SPARSEMEM_VMEMMAP configurations we can safely increment
> + * the page pointer and get the same value as we would get by calling
> + * pfn_to_page() on the incremented pfn.
> + */
> +#define __contig_next_page(page, pageblock_left, pfn, increment) \
> +	((page) + (increment))
> +
> +#define __contig_first_page(pageblock_left, pfn) pfn_to_page(pfn)
> +
> +#else
> +
> +/*
> + * If we cross a pageblock boundary, make sure we get a valid page pointer.
> + * Within a pageblock, incrementing the pointer is good enough and is a bit
> + * of an optimisation.
> + */
> +#define __contig_next_page(page, pageblock_left, pfn, increment)	\
> +	(likely((pageblock_left) -= (increment)) ? (page) + (increment)	\
> +	 : (((pageblock_left) = pageblock_nr_pages), pfn_to_page(pfn)))
> +
> +#define __contig_first_page(pageblock_left, pfn) (			\
> +	((pageblock_left) = pageblock_nr_pages -			\
> +		 ((pfn) & (pageblock_nr_pages - 1))),			\
> +	pfn_to_page(pfn))
> +
> +
> +#endif

For the love of Pete, please make those into functions if you're going
to keep them.  They're really unreadable like that.

You might also want to look at mm/internal.h's mem_map_offset() and
mem_map_next().  They're not _quite_ what you need, but they're close.
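
For reference, those helpers look roughly like this in mm/internal.h of that
era (paraphrased from memory; check the tree you are actually working
against):

/* Handle mem_map discontiguities at MAX_ORDER_NR_PAGES boundaries. */
static inline struct page *mem_map_offset(struct page *base, int offset)
{
	if (unlikely(offset >= MAX_ORDER_NR_PAGES))
		return pfn_to_page(page_to_pfn(base) + offset);
	return base + offset;
}

static inline struct page *mem_map_next(struct page *iter,
					struct page *base, int offset)
{
	if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
		unsigned long pfn = page_to_pfn(base) + offset;

		if (!pfn_valid(pfn))
			return NULL;
		return pfn_to_page(pfn);
	}
	return iter + 1;
}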

-- Dave


Patch

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index b2a81fd..003c52f 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -46,11 +46,13 @@  static inline void unset_migratetype_isolate(struct page *page)
 {
 	__unset_migratetype_isolate(page, MIGRATE_MOVABLE);
 }
+
+/* The functions below must be called on a range within a single zone. */
 extern unsigned long alloc_contig_freed_pages(unsigned long start,
 					      unsigned long end, gfp_t flag);
 extern int alloc_contig_range(unsigned long start, unsigned long end,
 			      gfp_t flags, unsigned migratetype);
-extern void free_contig_pages(struct page *page, int nr_pages);
+extern void free_contig_pages(unsigned long pfn, unsigned nr_pages);
 
 /*
  * For migration.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 46e78d4..32fda5d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5716,9 +5716,41 @@  out:
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
+#if defined(CONFIG_FLATMEM) || defined(CONFIG_SPARSEMEM_VMEMMAP)
+
+/*
+ * In FLATMEM and SPARSEMEM_VMEMMAP configurations we can safely increment
+ * the page pointer and get the same value as we would get by calling
+ * pfn_to_page() on the incremented pfn.
+ */
+#define __contig_next_page(page, pageblock_left, pfn, increment) \
+	((page) + (increment))
+
+#define __contig_first_page(pageblock_left, pfn) pfn_to_page(pfn)
+
+#else
+
+/*
+ * If we cross a pageblock boundary, make sure we get a valid page pointer.
+ * Within a pageblock, incrementing the pointer is good enough and is a bit
+ * of an optimisation.
+ */
+#define __contig_next_page(page, pageblock_left, pfn, increment)	\
+	(likely((pageblock_left) -= (increment)) ? (page) + (increment)	\
+	 : (((pageblock_left) = pageblock_nr_pages), pfn_to_page(pfn)))
+
+#define __contig_first_page(pageblock_left, pfn) (			\
+	((pageblock_left) = pageblock_nr_pages -			\
+		 ((pfn) & (pageblock_nr_pages - 1))),			\
+	pfn_to_page(pfn))
+
+
+#endif
+
 unsigned long alloc_contig_freed_pages(unsigned long start, unsigned long end,
 				       gfp_t flag)
 {
+	unsigned long pageblock_left __maybe_unused;
 	unsigned long pfn = start, count;
 	struct page *page;
 	struct zone *zone;
@@ -5729,27 +5761,37 @@  unsigned long alloc_contig_freed_pages(unsigned long start, unsigned long end,
 
 	spin_lock_irq(&zone->lock);
 
-	page = pfn_to_page(pfn);
+	page = __contig_first_page(pageblock_left, pfn);
 	for (;;) {
-		VM_BUG_ON(page_count(page) || !PageBuddy(page));
+		VM_BUG_ON(page_count(page) || !PageBuddy(page) ||
+			  page_zone(page) != zone);
+
 		list_del(&page->lru);
 		order = page_order(page);
+		count = 1UL << order;
 		zone->free_area[order].nr_free--;
 		rmv_page_order(page);
-		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
-		pfn  += 1 << order;
+		__mod_zone_page_state(zone, NR_FREE_PAGES, -(long)count);
+
+		pfn += count;
 		if (pfn >= end)
 			break;
 		VM_BUG_ON(!pfn_valid(pfn));
-		page += 1 << order;
+
+		page = __contig_next_page(page, pageblock_left, pfn, count);
 	}
 
 	spin_unlock_irq(&zone->lock);
 
 	/* After this, pages in the range can be freed one be one */
-	page = pfn_to_page(start);
-	for (count = pfn - start; count; --count, ++page)
+	count = pfn - start;
+	pfn = start;
+	page = __contig_first_page(pageblock_left, pfn);
+	for (; count; --count) {
 		prep_new_page(page, 0, flag);
+		++pfn;
+		page = __contig_next_page(page, pageblock_left, pfn, 1);
+	}
 
 	return pfn;
 }
@@ -5903,10 +5945,16 @@  done:
 	return ret;
 }
 
-void free_contig_pages(struct page *page, int nr_pages)
+void free_contig_pages(unsigned long pfn, unsigned nr_pages)
 {
-	for (; nr_pages; --nr_pages, ++page)
+	unsigned long pageblock_left __maybe_unused;
+	struct page *page = __contig_first_page(pageblock_left, pfn);
+
+	while (nr_pages--) {
 		__free_page(page);
+		++pfn;
+		page = __contig_next_page(page, pageblock_left, pfn, 1);
+	}
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE